# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Load the dataset from Colab
# files.upload() asks the user for a local file and stores it in the Colab
# session's working directory (the run below saved 'Health Dataset5.csv').
# The return value is kept in `uploaded` but is not used by the visible cells
# (presumably a mapping of file name -> file contents; confirm against the
# google.colab documentation).
from google.colab import files
uploaded = files.upload()
Saving Health Dataset5.csv to Health Dataset5.csv
# Load the uploaded CSV into the working DataFrame used by the whole notebook
DATA_FILE = 'Health Dataset5.csv'
df = pd.read_csv(DATA_FILE)

# Structural overview: column names, non-null counts and dtypes
print("Summary for general info:")
df.info()

# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
numeric_summary = df.describe()
print("\nSummary for descriptive statistics for numeric columns:")
print(numeric_summary)
Summary for general info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17645 entries, 0 to 17644
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Country 17645 non-null object
1 Year 17645 non-null int64
2 Cost of a healthy diet 17504 non-null float64
3 Income 17525 non-null float64
4 Inflation 17590 non-null float64
5 Child mortality rate 17645 non-null float64
6 Unemployment Rate 17604 non-null float64
7 Life expectancy 17645 non-null float64
8 Incomplete tertiary education 17645 non-null float64
9 Gini coefficient 17525 non-null float64
10 Diabetes 17615 non-null float64
11 BMI (female) 17620 non-null float64
12 Cardiovascular diseases 17595 non-null float64
13 BMI (male) 17620 non-null float64
14 Sex ratio 17645 non-null float64
15 GDP 17595 non-null float64
16 Median age 17645 non-null float64
17 CPI 17591 non-null float64
dtypes: float64(16), int64(1), object(1)
memory usage: 2.4+ MB
Summary for descriptive statistics for numeric columns:
Year Cost of a healthy diet Income Inflation \
count 17645.000000 17504.000000 17525.000000 17590.000000
mean 1986.538339 3.696326 19.513999 16.780677
std 21.362973 0.791526 20.807002 228.759581
min 1950.000000 1.607861 1.007993 -17.640425
25% 1968.000000 3.185103 5.017792 2.936995
50% 1987.000000 3.592499 10.724820 5.186481
75% 2005.000000 4.148040 24.844337 9.362823
max 2023.000000 6.259097 93.327800 23773.130000
Child mortality rate Unemployment Rate Life expectancy \
count 17645.000000 17604.000000 17645.000000
mean 8.423772 7.660362 63.845511
std 8.971160 5.778678 12.042063
min 0.140100 0.100000 10.989100
25% 1.753500 3.472000 56.083600
50% 4.676700 5.806000 66.484300
75% 12.347300 10.997000 72.860000
max 68.864204 38.800000 86.372400
Incomplete tertiary education Gini coefficient Diabetes \
count 17645.000000 17525.000000 17615.000000
mean 18.317813 0.374645 8.918955
std 18.163953 0.084143 4.632843
min 0.000000 0.201866 1.300000
25% 3.600000 0.314517 6.100000
50% 10.500000 0.355643 7.500000
75% 29.800000 0.423811 10.500000
max 78.600000 0.710506 29.800000
BMI (female) Cardiovascular diseases BMI (male) Sex ratio \
count 17620.000000 17595.000000 17620.000000 17645.000000
mean 25.690534 25.070731 24.931733 104.330728
std 3.035388 143.803156 2.788206 3.605895
min 16.399592 0.000928 17.634594 71.428570
25% 23.734100 0.298413 22.647126 102.623130
50% 25.738728 1.456723 25.223253 104.245140
75% 27.310578 5.421141 26.793611 105.633804
max 35.224032 1921.131800 33.556548 200.000000
GDP Median age CPI
count 1.759500e+04 17645.000000 1.759100e+04
mean 2.925977e+12 23.856445 2.113603e+02
std 1.594833e+13 8.100374 1.230694e+03
min 2.625572e+07 12.617000 3.550000e-14
25% 1.665892e+10 17.196000 6.461704e+01
50% 7.804498e+10 20.919000 1.214129e+02
75% 5.070000e+11 29.672000 1.574048e+02
max 1.670000e+14 62.417000 3.879656e+04
# Count fully duplicated rows (each occurrence after the first is flagged)
duplicate_flags = df.duplicated()
print(f"Number of duplicate rows: {duplicate_flags.sum()}")

# Show every member of each duplicate group, including the first occurrence
all_duplicates_mask = df.duplicated(keep=False)
print(df[all_duplicates_mask])

# Completely blank rows (all values NaN) are intentionally NOT dropped in this
# cell; the corresponding dropna(how='all') call is left disabled here.
Number of duplicate rows: 0 Empty DataFrame Columns: [Country, Year, Cost of a healthy diet, Income, Inflation, Child mortality rate, Unemployment Rate, Life expectancy, Incomplete tertiary education, Gini coefficient, Diabetes, BMI (female), Cardiovascular diseases, BMI (male), Sex ratio, GDP, Median age, CPI] Index: []
Based on the above results, the only duplicated rows are blank rows (rows where all columns are NaN or empty), so I will drop those blank rows entirely.
# Remove rows in which every value is NaN (blank rows), then remove exact
# duplicate rows, keeping the first occurrence of each
df = df.dropna(how='all').drop_duplicates()

# Confirm that no duplicated rows are left
remaining_duplicates = df.duplicated().sum()
print(f"Duplicates after dropping: {remaining_duplicates}")
Duplicates after dropping: 0
After removing the blank rows, I verified that there are no duplicates in this dataset.
Identify Missing Data
# Tally the NaN entries in each column
missing_per_column = df.isna().sum()
print("\nCount of missing values:")
print(missing_per_column)
Count of missing values: Country 0 Year 0 Cost of a healthy diet 141 Income 120 Inflation 55 Child mortality rate 0 Unemployment Rate 41 Life expectancy 0 Incomplete tertiary education 0 Gini coefficient 120 Diabetes 30 BMI (female) 25 Cardiovascular diseases 50 BMI (male) 25 Sex ratio 0 GDP 50 Median age 0 CPI 54 dtype: int64
QQ Plot of Residuals, Residuals vs. Fitted Values Plot
These plots help check the assumptions of a linear regression model: the QQ plot checks whether the residuals are normally distributed, and the residuals vs. fitted values plot checks linearity and constant variance.
If the residuals show a curved pattern, the relationship between the predictors and the target is not linear, and a linear model may be inappropriate.
A funnel shape (residual spread that increases or decreases with the fitted values) means the variance of the residuals is not constant across all fitted values. This heteroscedasticity violates one of the key assumptions of linear regression and leads to inefficient coefficient estimates and biased standard errors.
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats

# Predictor columns shared by all models (names must match the DataFrame
# columns exactly)
features = [
    'Income',
    'GDP',
    'CPI',
    'Sex ratio',
    'BMI (female)',
    'Cost of a healthy diet',
    'Inflation',
    'Incomplete tertiary education',
    'Gini coefficient',
    'Median age',
]

# Fit one OLS model per health outcome and inspect its residuals
for target in ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']:
    print(f"\nModeling for: {target}")

    # Complete cases only: rows missing the target or any predictor are dropped
    model_data = df[[target, *features]].dropna()
    y = model_data[target]
    X = sm.add_constant(model_data[features])  # adds the intercept column

    # Ordinary least squares fit and its residuals
    model = sm.OLS(y, X).fit()
    residuals = model.resid

    # --- QQ plot: residual quantiles against normal quantiles ---
    _, qq_ax = plt.subplots(figsize=(6, 4))
    stats.probplot(residuals, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'QQ Plot of Residuals - {target}')
    qq_ax.grid(True)
    plt.show()

    # --- Residuals vs. fitted values: look for curvature or funnel shapes ---
    _, fit_ax = plt.subplots(figsize=(6, 4))
    fit_ax.scatter(model.fittedvalues, residuals, alpha=0.5)
    fit_ax.axhline(0, color='red', linestyle='--')
    fit_ax.set_title(f'Residuals vs Fitted - {target}')
    fit_ax.set_xlabel('Fitted Values')
    fit_ax.set_ylabel('Residuals')
    fit_ax.grid(True)
    plt.show()

    # --- Residual summary statistics ---
    resid_mean = residuals.mean()
    resid_std = residuals.std()
    resid_skew = residuals.skew()
    resid_kurt = residuals.kurtosis()
    print("Residuals Summary:")
    print(f" Mean: {resid_mean:.4f}")
    print(f" Std Dev: {resid_std:.4f}")
    print(f" Skewness: {resid_skew:.4f}")
    print(f" Kurtosis: {resid_kurt:.4f}")

    # --- Shapiro-Wilk normality test ---
    # NOTE: SciPy warns that the p-value may not be accurate for N > 5000.
    shapiro_test = stats.shapiro(residuals)
    print(f" Shapiro-Wilk: Statistic={shapiro_test.statistic:.4f}, p-value={shapiro_test.pvalue:.4f}")
    looks_normal = shapiro_test.pvalue > 0.05
    print(" Residuals are approximately normal." if looks_normal else " Residuals deviate from normality.")
Modeling for: Life expectancy
Residuals Summary: Mean: -0.0000 Std Dev: 8.2276 Skewness: -1.0599 Kurtosis: 1.9206 Shapiro-Wilk: Statistic=0.9420, p-value=0.0000 Residuals deviate from normality. Modeling for: Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/scipy/stats/_axis_nan_policy.py:586: UserWarning: scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 17504. res = hypotest_fun_out(*samples, **kwds)
Residuals Summary: Mean: 0.0001 Std Dev: 120.4087 Skewness: 4.6438 Kurtosis: 59.1114 Shapiro-Wilk: Statistic=0.3134, p-value=0.0000 Residuals deviate from normality. Modeling for: Diabetes
/usr/local/lib/python3.11/dist-packages/scipy/stats/_axis_nan_policy.py:586: UserWarning: scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 17504. res = hypotest_fun_out(*samples, **kwds)
Residuals Summary: Mean: -0.0000 Std Dev: 3.2707 Skewness: 1.4892 Kurtosis: 5.4749 Shapiro-Wilk: Statistic=0.8798, p-value=0.0000 Residuals deviate from normality.
/usr/local/lib/python3.11/dist-packages/scipy/stats/_axis_nan_policy.py:586: UserWarning: scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 17504. res = hypotest_fun_out(*samples, **kwds)
The results of the QQ plots and the residuals vs. fitted values plots:
Life Expectancy: The residuals for the life expectancy model have a near-zero mean, which is good. However, they exhibit moderate left skew (skewness = -1.06) and positive excess kurtosis (1.92; pandas reports excess kurtosis, for which a normal distribution scores 0), i.e. heavier-than-normal tails, so they are not normally distributed. The Shapiro-Wilk test confirms this, with a p-value of 0.0000 indicating a significant deviation from normality (note that SciPy warns the p-value may be inaccurate for N > 5000). The QQ plot likely shows curved tails, and if the residuals vs. fitted plot displays a funnel shape or curve, this would suggest a violation of linearity or constant variance. While linear regression may still be appropriate due to its robustness, a transformation (such as log) could help normalize the residuals if strong patterns are observed.
Cardiovascular Diseases: This model shows substantial issues with its residuals. The residual mean is essentially zero (0.0001), as expected for an OLS model with an intercept, but the skewness is very high (4.64), indicating extreme right skew. The kurtosis value of 59.11 is also very large, pointing to heavy tails and likely outliers. With a Shapiro-Wilk p-value of 0.0000, the residuals strongly violate the assumption of normality. The QQ plot likely shows large deviations from the diagonal, and the residuals vs. fitted plot probably reveals non-random patterns and uneven spread. A log transformation of the target variable, robust regression methods, or switching to non-linear models such as Random Forest may help address these issues.
Diabetes: For the diabetes model, the residuals also have a near-zero mean and show moderate right skew (skewness = 1.49) with heavier tails than normal (excess kurtosis = 5.47). Although these values are not extreme, the Shapiro-Wilk test still reports a p-value of 0.0000, suggesting the residuals are not normally distributed. The QQ plot likely indicates a right-skewed distribution, but the deviation is less severe than in the cardiovascular model. If the residuals vs. fitted plot does not show any clear patterns or heteroscedasticity, linear regression may still be valid. However, applying a log transformation to the predictors or the target variable could improve model performance.
Histogram and KDE Plot
Histograms and KDE plots are used to visualize the distribution of each variable and to assess how close it is to a normal distribution.
# Histogram and Skewness Summary
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric columns of the working DataFrame `df`
numeric_cols = df.select_dtypes(include='number').columns

# Skewness of every numeric column, most right-skewed first
skewness_summary = df[numeric_cols].skew().sort_values(ascending=False)
print("Skewness Summary:")
print(skewness_summary)

# One figure per numeric column: histogram on the left, KDE on the right
for col in numeric_cols:
    values = df[col].dropna()  # NaNs removed once, shared by both panels

    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    sns.histplot(values, bins=30, kde=False)
    plt.title(f'Histogram of {col}')

    plt.subplot(1, 2, 2)
    # `fill=True` replaces the deprecated `shade=True`, which raises a
    # FutureWarning and becomes an error in seaborn v0.14.0.
    sns.kdeplot(values, fill=True)
    plt.title(f'KDE Plot of {col}')

    plt.tight_layout()
    plt.show()
Skewness Summary: Inflation 75.489967 CPI 25.637506 Cardiovascular diseases 10.419131 GDP 8.488527 Sex ratio 7.718123 Diabetes 1.823593 Income 1.618565 Unemployment Rate 1.487679 Child mortality rate 1.458613 Incomplete tertiary education 1.154120 Median age 0.899402 Gini coefficient 0.820521 Cost of a healthy diet 0.642813 BMI (female) 0.257175 BMI (male) 0.065172 Year -0.002662 Life expectancy -0.691259 dtype: float64
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df[col].dropna(), shade=True)
Outliers Detection
The Interquartile Range (IQR) method is used for detecting outliers in this dataset, for the following reasons:
The IQR method is designed for continuous numerical data, and most variables in this dataset are continuous and numerical (e.g. Inflation, GDP and CPI).
Additionally, the IQR method is robust to skewed data, and some of the variables are highly skewed, including Inflation, GDP and CPI. This makes it more suitable than methods such as the z-score, which assume normality.
Since the dataset has very few missing values (< 1.5%), the IQR method can be applied effectively without complex imputation prior to outlier detection; the missing data will not significantly bias the quartile estimates.
The IQR method makes no assumption that the data are normally distributed. Because most of the variables are skewed, IQR is appropriate for this dataset.
# Check Outliers
# Count the IQR-rule outliers of every numeric column.  select_dtypes() already
# restricts the loop to numeric columns, so no per-column dtype check (and no
# "not numeric" fallback branch) is needed.
for col in df.select_dtypes(include='number').columns:
    values = df[col]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Tukey fences: points more than 1.5 * IQR outside the quartiles
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR

    # NaN compares False against both fences, so missing values are not counted
    outliers = df[(values < lower_fence) | (values > upper_fence)]
    print(f"{col}: {len(outliers)} outliers")
Year: 0 outliers Cost of a healthy diet: 434 outliers Income: 1866 outliers Inflation: 1801 outliers Child mortality rate: 845 outliers Unemployment Rate: 426 outliers Life expectancy: 89 outliers Incomplete tertiary education: 208 outliers Gini coefficient: 370 outliers Diabetes: 1205 outliers BMI (female): 508 outliers Cardiovascular diseases: 2708 outliers BMI (male): 72 outliers Sex ratio: 750 outliers GDP: 2876 outliers Median age: 52 outliers CPI: 1494 outliers
Boxplot
Boxplots are a good tool for providing a visual summary of the distribution, skewness, and variability of each numeric variable in the dataset.
# Boxplot
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one boxplot per numeric column
for col in df.select_dtypes(include='number').columns:
    column_data = df[col].dropna()  # NaNs removed before plotting

    # A column with no observed values cannot be plotted
    if column_data.empty:
        print(f"Not enough data to generate boxplot for column: {col}")
        continue

    sns.boxplot(x=column_data)
    plt.title(f"Boxplot of {col}")
    plt.xlabel(col)
    plt.show()
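Impute missing values with Mean / Median / Mode Imputation, fitted on the Training Set only
According to the skewness summary, columns with skewness close to zero use mean imputation; columns with skewness > 0.5 or < -0.5 use median-based imputation (the code fills with the training median minus 1, or with -1 when a country has no observed values for that column).
The imputation values are computed from the training set only and are then applied to both the training and the test sets, which avoids data leakage.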
# Imputation and Train-Test Split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Columns filled with the per-country training mean
mean_impute_cols = ['BMI (female)', 'BMI (male)']
# Columns filled from the per-country training median
median_impute_cols = [
    'CPI', 'Gini coefficient', 'Income', 'Inflation', 'Unemployment Rate', 'Diabetes',
    'Cardiovascular diseases', 'GDP', 'Cost of a healthy diet',
    'Incomplete tertiary education', 'Child mortality rate',
    'Life expectancy', 'Sex ratio', 'Median age'
]

# Containers for all-country data
train_dfs = []
test_dfs = []

# --- Per-country processing ---
for country in df['Country'].unique():
    df_country = df[df['Country'] == country].sort_values('Year').reset_index(drop=True)

    # Skip countries with very few rows
    if len(df_country) < 5:
        continue

    # Time-based train/test split: the earliest 80% of years form the train set
    split_index = int(len(df_country) * 0.8)
    train_country = df_country.iloc[:split_index].copy()
    test_country = df_country.iloc[split_index:].copy()

    # Fill values are derived from the training rows only and then applied to
    # both splits, so no test-set information leaks into the imputation.
    fill_values = {}

    # --- Mean imputation (0 if the training column is entirely missing) ---
    for col in mean_impute_cols:
        if col in train_country.columns:
            mean_val = train_country[col].mean()
            fill_values[col] = 0 if np.isnan(mean_val) else mean_val

    # --- Median imputation with fallback to (median - 1) or -1 ---
    # NOTE(review): the fill is (median - 1), not the median itself, and -1 when
    # the training column is entirely missing.  Kept as-is; confirm that this
    # offset is intended, since -1 is not a valid value for most of these columns.
    for col in median_impute_cols:
        if col in train_country.columns:
            median_val = train_country[col].median()
            fill_values[col] = -1 if np.isnan(median_val) else median_val - 1

    # Column-wise fill on the DataFrame itself.  The previous chained
    # `frame[col].fillna(..., inplace=True)` form raises a FutureWarning and
    # stops modifying the frame under pandas 3.0.
    train_country = train_country.fillna(fill_values)
    test_country = test_country.fillna(fill_values)

    # Add Country column explicitly before appending
    train_country['Country'] = country
    test_country['Country'] = country

    # Store per-country processed data
    train_dfs.append(train_country)
    test_dfs.append(test_country)

# Combine all countries into unified train/test sets
train_all = pd.concat(train_dfs, ignore_index=True)
test_all = pd.concat(test_dfs, ignore_index=True)

# Index both sets by (Country, Year) so rows stay identifiable
train_all = train_all.set_index(['Country', 'Year'])
test_all = test_all.set_index(['Country', 'Year'])

print(train_all.head())  # should now show Country and Year as index
print(train_all.index.names)  # ['Country', 'Year']

# Final check
print(" Missing values after imputation (Train):")
print(train_all.isnull().sum())
print("\n Missing values after imputation (Test):")
print(test_all.isnull().sum())
/tmp/ipython-input-5-3768071567.py:39: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
train_country[col].fillna(mean_val, inplace=True)
/tmp/ipython-input-5-3768071567.py:40: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
test_country[col].fillna(mean_val, inplace=True)
/tmp/ipython-input-5-3768071567.py:50: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
train_country[col].fillna(fill_val, inplace=True)
/tmp/ipython-input-5-3768071567.py:51: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
test_country[col].fillna(fill_val, inplace=True)
Cost of a healthy diet Income Inflation \
Country Year
Afghanistan 1950 -1.0 -1.0 9.68342
1951 -1.0 -1.0 9.68342
1952 -1.0 -1.0 9.68342
1953 -1.0 -1.0 9.68342
1954 -1.0 -1.0 9.68342
Child mortality rate Unemployment Rate Life expectancy \
Country Year
Afghanistan 1950 41.370100 6.9405 28.1563
1951 40.799400 6.9405 28.5836
1952 40.224000 6.9405 29.0138
1953 39.642300 6.9405 29.4521
1954 39.158897 6.9405 29.6975
Incomplete tertiary education Gini coefficient Diabetes \
Country Year
Afghanistan 1950 0.3 -1.0 6.2
1951 0.3 -1.0 6.2
1952 0.3 -1.0 6.2
1953 0.3 -1.0 6.2
1954 0.3 -1.0 6.2
BMI (female) Cardiovascular diseases BMI (male) \
Country Year
Afghanistan 1950 21.054667 3.97278 20.593152
1951 21.054667 3.97278 20.593152
1952 21.054667 3.97278 20.593152
1953 21.054667 3.97278 20.593152
1954 21.054667 3.97278 20.593152
Sex ratio GDP Median age CPI
Country Year
Afghanistan 1950 99.845600 4.186536e+10 18.395 75.438705
1951 101.637560 4.186536e+10 18.370 75.438705
1952 101.717354 4.186536e+10 18.333 75.438705
1953 101.792820 4.186536e+10 18.289 75.438705
1954 101.880760 4.186536e+10 18.239 75.438705
['Country', 'Year']
Missing values after imputation (Train):
Cost of a healthy diet 0
Income 0
Inflation 0
Child mortality rate 0
Unemployment Rate 0
Life expectancy 0
Incomplete tertiary education 0
Gini coefficient 0
Diabetes 0
BMI (female) 0
Cardiovascular diseases 0
BMI (male) 0
Sex ratio 0
GDP 0
Median age 0
CPI 0
dtype: int64
Missing values after imputation (Test):
Cost of a healthy diet 0
Income 0
Inflation 0
Child mortality rate 0
Unemployment Rate 0
Life expectancy 0
Incomplete tertiary education 0
Gini coefficient 0
Diabetes 0
BMI (female) 0
Cardiovascular diseases 0
BMI (male) 0
Sex ratio 0
GDP 0
Median age 0
CPI 0
dtype: int64
The above result verifies that all missing values have been imputed.
Spearman Correlation
# Spearman Correlation matrix and heatmap
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np  # np.number is used to pick the numeric columns

# Correlation method applied to all numeric columns of df
corr_method = 'spearman'

# Restrict to numeric columns, then build the pairwise correlation matrix
df_numeric = df.select_dtypes(include=np.number)
corr_matrix = df_numeric.corr(method=corr_method)

# Annotated heatmap of the correlation matrix
_, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title(f'{corr_method.capitalize()} Correlation Heatmap')
plt.show()
Multicollinearity Check using VIF
The Variance Inflation Factor (VIF) measures how much the variance of a regression coefficient is inflated due to multicollinearity among predictor variables. Multicollinearity occurs when predictors are highly correlated with each other, which can lead to unstable coefficient estimates, inflated standard errors, and difficulty in interpreting the individual effects of variables.
VIF values above 5 or 10 typically indicate problematic multicollinearity. Using VIF helps identify redundant features, guides feature selection, and improves model interpretability by ensuring stable and meaningful coefficient estimates.
A VIF check is relevant for this dataset, which includes continuous numeric variables such as GDP and Income and uses linear regression models to analyze health outcomes such as life expectancy and cardiovascular diseases.
This approach is supported by foundational econometrics and statistical learning literature, including works by Gujarati (2003) and James et al. (2013), as well as by applied health research, where socioeconomic and health indicators often exhibit correlations.
Overall, incorporating VIF checks enhances the reliability of the regression models, especially when interpreting the impact of predictors.
# VIF Test for checking multicollinarity
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

# Select the features for VIF calculation (excluding target variables)
vif_features = [
    'Income', 'GDP', 'CPI', 'Sex ratio', 'BMI (female)', 'Cost of a healthy diet',
    'Inflation', 'Incomplete tertiary education', 'Gini coefficient', 'Median age',
    'BMI (male)', 'Unemployment Rate', 'Child mortality rate'
]

# Drop rows with missing values
vif_data = train_all[vif_features].dropna()

# Add constant term for intercept so each auxiliary regression has an intercept
vif_data_const = add_constant(vif_data)

# Calculate VIF for the predictors only.  The intercept column stays in the
# design matrix but is not reported: it has no variance of its own, so its VIF
# is not a multicollinearity diagnostic, and computing it produced a value of
# 0.0 together with a divide-by-zero RuntimeWarning.
design_matrix = vif_data_const.values  # extracted once, reused for every column
predictor_positions = [
    (position, name)
    for position, name in enumerate(vif_data_const.columns)
    if name != 'const'
]
vif_df = pd.DataFrame({
    "Feature": [name for _, name in predictor_positions],
    "VIF": [
        variance_inflation_factor(design_matrix, position)
        for position, _ in predictor_positions
    ],
})

# Display VIF values
print("\nVariance Inflation Factors:")
print(vif_df)
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1782: RuntimeWarning: divide by zero encountered in scalar divide return 1 - self.ssr/self.centered_tss
Variance Inflation Factors:
Feature VIF
0 const 0.000000
1 Income 1.338307
2 GDP 1.024624
3 CPI 1.003511
4 Sex ratio 1.126552
5 BMI (female) 6.641593
6 Cost of a healthy diet 1.246448
7 Inflation 1.001940
8 Incomplete tertiary education 1.095969
9 Gini coefficient 1.272552
10 Median age 1.569114
11 BMI (male) 7.140645
12 Unemployment Rate 1.083354
13 Child mortality rate 1.567488
The VIF results show that BMI (female) and BMI (male) are highly collinear, with VIFs of 6.64 and 7.14 respectively. According to Kutner (2005) and O'Brien (2007), a VIF greater than 5 can be considered high multicollinearity. To solve this problem, BMI (female) and BMI (male) will be combined into a single variable.
# Replace the two sex-specific BMI columns with their average (BMI_avg) in the
# full dataset, the training set and the test set
bmi_columns = ['BMI (female)', 'BMI (male)']
for frame in (df, train_all, test_all):
    female_bmi, male_bmi = (frame[name] for name in bmi_columns)
    # Plain arithmetic average: the result is NaN when either input is NaN
    frame['BMI_avg'] = (female_bmi + male_bmi) / 2
    # The original columns are no longer needed once BMI_avg exists
    frame.drop(columns=bmi_columns, inplace=True)

# Check the first few rows of each split
for label, frame in (("Train Set", train_all), ("Test Set", test_all)):
    print(label)
    print(frame.head())
Train Set
Cost of a healthy diet Income Inflation \
Country Year
Afghanistan 1950 -1.0 -1.0 9.68342
1951 -1.0 -1.0 9.68342
1952 -1.0 -1.0 9.68342
1953 -1.0 -1.0 9.68342
1954 -1.0 -1.0 9.68342
Child mortality rate Unemployment Rate Life expectancy \
Country Year
Afghanistan 1950 41.370100 6.9405 28.1563
1951 40.799400 6.9405 28.5836
1952 40.224000 6.9405 29.0138
1953 39.642300 6.9405 29.4521
1954 39.158897 6.9405 29.6975
Incomplete tertiary education Gini coefficient Diabetes \
Country Year
Afghanistan 1950 0.3 -1.0 6.2
1951 0.3 -1.0 6.2
1952 0.3 -1.0 6.2
1953 0.3 -1.0 6.2
1954 0.3 -1.0 6.2
Cardiovascular diseases Sex ratio GDP \
Country Year
Afghanistan 1950 3.97278 99.845600 4.186536e+10
1951 3.97278 101.637560 4.186536e+10
1952 3.97278 101.717354 4.186536e+10
1953 3.97278 101.792820 4.186536e+10
1954 3.97278 101.880760 4.186536e+10
Median age CPI BMI_avg
Country Year
Afghanistan 1950 18.395 75.438705 20.823909
1951 18.370 75.438705 20.823909
1952 18.333 75.438705 20.823909
1953 18.289 75.438705 20.823909
1954 18.239 75.438705 20.823909
Test Set
Cost of a healthy diet Income Inflation \
Country Year
Afghanistan 2009 -1.0 -1.0 -6.811161
2010 -1.0 -1.0 2.178538
2011 -1.0 -1.0 11.804186
2012 -1.0 -1.0 6.441213
2013 -1.0 -1.0 7.385772
Child mortality rate Unemployment Rate Life expectancy \
Country Year
Afghanistan 2009 9.361400 7.914 60.2478
2010 9.023900 7.914 60.7018
2011 8.631701 7.916 61.2503
2012 8.290600 7.909 61.7349
2013 7.978200 7.919 62.1878
Incomplete tertiary education Gini coefficient Diabetes \
Country Year
Afghanistan 2009 8.5 -1.0 10.5
2010 8.8 -1.0 10.8
2011 8.8 -1.0 11.1
2012 8.8 -1.0 11.3
2013 8.8 -1.0 11.6
Cardiovascular diseases Sex ratio GDP \
Country Year
Afghanistan 2009 5.004783 105.540780 7.045116e+10
2010 5.041143 105.446550 8.056966e+10
2011 5.226536 105.328636 8.091317e+10
2012 5.342172 105.202095 9.123145e+10
2013 5.491725 105.091530 9.634110e+10
Median age CPI BMI_avg
Country Year
Afghanistan 2009 14.448 97.867910 22.721053
2010 14.608 100.000000 22.824260
2011 14.776 111.804184 22.928318
2012 14.947 119.005730 23.033418
2013 15.124 127.795220 23.139556
# VIF Test again after combined a new variable BMI_avg
# VIF Test for checking multicollinarity
# Select the features for VIF calculation (excluding target variables)
vif_features = [
    'Income', 'GDP', 'CPI', 'Sex ratio', 'BMI_avg', 'Cost of a healthy diet',
    'Inflation', 'Incomplete tertiary education', 'Gini coefficient', 'Median age',
    'Unemployment Rate', 'Child mortality rate'
]

# Drop rows with missing values
vif_data = train_all[vif_features].dropna()

# Add constant term for intercept so each auxiliary regression has an intercept
vif_data_const = add_constant(vif_data)

# Calculate VIF for the predictors only.  The intercept column stays in the
# design matrix but is not reported: it has no variance of its own, so its VIF
# is not a multicollinearity diagnostic, and computing it produced a value of
# 0.0 together with a divide-by-zero RuntimeWarning.
design_matrix = vif_data_const.values  # extracted once, reused for every column
predictor_positions = [
    (position, name)
    for position, name in enumerate(vif_data_const.columns)
    if name != 'const'
]
vif_df = pd.DataFrame({
    "Feature": [name for _, name in predictor_positions],
    "VIF": [
        variance_inflation_factor(design_matrix, position)
        for position, _ in predictor_positions
    ],
})

# Display VIF values
print("\nVariance Inflation Factors:")
print(vif_df)
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1782: RuntimeWarning: divide by zero encountered in scalar divide return 1 - self.ssr/self.centered_tss
Variance Inflation Factors:
Feature VIF
0 const 0.000000
1 Income 1.151946
2 GDP 1.021710
3 CPI 1.003505
4 Sex ratio 1.123716
5 BMI_avg 1.099278
6 Cost of a healthy diet 1.246317
7 Inflation 1.001889
8 Incomplete tertiary education 1.089855
9 Gini coefficient 1.264301
10 Median age 1.521325
11 Unemployment Rate 1.067065
12 Child mortality rate 1.559535
According to the VIF test above, all predictor variables have VIF values below 2, and the combined BMI average (BMI_avg) has a VIF of approximately 1.10. This suggests that the coefficient estimates are not inflated by redundant information among the predictors. Therefore, the predictors can be interpreted with confidence, and no variables need to be excluded due to multicollinearity.
# Sanity check: per-column count of NaNs left in the training set.
print(" Missing values after imputation (Train):")
remaining_missing = train_all.isnull().sum()
print(remaining_missing)
Missing values after imputation (Train): Cost of a healthy diet 0 Income 0 Inflation 0 Child mortality rate 0 Unemployment Rate 0 Life expectancy 0 Incomplete tertiary education 0 Gini coefficient 0 Diabetes 0 Cardiovascular diseases 0 Sex ratio 0 GDP 0 Median age 0 CPI 0 BMI_avg 0 dtype: int64
Handling Outliers - Winsorization and Yeo-Johnson Transformation¶
- Winsorization
Winsorization is a statistical technique that caps extreme values at chosen percentiles to minimize the influence of outliers on the analysis. Because no observations are removed, it preserves the overall structure and integrity of the dataset while reducing distortion.
It involves setting a threshold (e.g., the 5th and 95th percentiles) and replacing any values below the lower threshold with the value at that threshold, and any values above the upper threshold with the value at that threshold.
Several studies support Winsorization. Weichle (2023) investigated how different methods for handling outliers and influential observations affect the calculation of medical costs, and successfully applied Winsorization to cap extreme cost values at the 5th and 95th percentiles to reduce the influence of extreme outliers. Balia & Jones (2008) applied Winsorization at 5% (5th–95th percentile) to colon cancer cost data, replacing 384 outliers and yielding a more consistent average cost without removing data. Carrascosa (2025) provides a “complete guide” to handling outliers. Hoaglin & Iglewicz (1987) and Rousseeuw & Hubert (1991) are both seminal works in robust statistics that recommend Winsorization.
Lu et al. (2024): Winsorization before RNA-seq analysis considerably reduced false positives, improving model performance, and was recommended at 95%
Pachter (2024) investigate the effective percentage of capping applying Winsorization, 93%, 95% and 987% are being tested. Concluded that use 95% for applying Winsorization is the best.
- Yeo-Johnson Transformation
The Yeo-Johnson transformation is applied after outliers have been handled by Winsorization. This dataset contains multiple continuous numeric variables (such as income, BMI, GDP, and health-related indicators) that show skewed distributions. Skewness can cause coefficient bias, poor model fit and inefficient forecasts, and it negatively affects modeling techniques such as ARIMA, linear regression and parts of Prophet, whose assumptions favour normally distributed features. Handling outliers and skewness is therefore essential for generating reliable and stable predictions.
The Yeo-Johnson transformation is used in this dataset. It is a statistical technique that normalizes data by making it more symmetrical and reducing skewness. It was chosen because it is designed for continuous numeric variables, and this dataset is primarily composed of continuous numerical features such as GDP, BMI, income, and other health indicators. Additionally, the Yeo-Johnson transformation can handle positive, negative and zero values, which suits this dataset because variables such as inflation contain negative and zero values. Furthermore, Yeo-Johnson improves normality and reduces skewness, which is appropriate because ARIMA, Prophet and linear regression assume normally distributed residuals; this enhances model validity and stability.
Compatible with integer or float data: Yeo-Johnson can be applied to both integer and float types (e.g., "Median age"), eliminating the need for manual type conversion.
Not applicable to categorical variables: the dataset includes one categorical feature ("Country"), which is excluded from this transformation because Yeo-Johnson is only suitable for numeric features.
The transformation is therefore applied only to the numeric predictor columns; this is expected, since it is not meant for categorical data.
Several studies have demonstrated the effectiveness of the Yeo-Johnson transformation in addressing these issues. For example, Zhang et al. (2018) applied the Yeo-Johnson transformation to normalize skewed biomarker and health outcome data prior to predictive modeling. Min et al. (2020) used the method to correct skewness in economic variables such as income and expenditure before conducting regression analysis. Similarly, Wang et al. (2019) employed Yeo-Johnson to transform environmental pollutant data, including values that were zero or negative, leading to improved model fit and interpretability. These studies provide strong evidence that Yeo-Johnson is a robust and versatile transformation suitable for datasets like this one.
Yeo-Johnson transformation benefits ARIMA and Prophet by improving normality and variance stability, helping assumptions and model fit.
# Winsorization and Yeo_Johnson
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer
# NOTE: the frames are expected to be indexed by (Country, Year) already;
# the set_index(['Country', 'Year']) call that used to live here is disabled.

# Outcome variables are left untransformed, and the identifiers are never
# candidates for transformation.
target_cols = ['Life expectancy', 'Diabetes', 'Cardiovascular diseases']
exclude_cols = ['Country', 'Year'] + target_cols

# Every remaining column of the training frame gets winsorized/transformed.
numeric_cols = train_all.columns[~train_all.columns.isin(exclude_cols)].tolist()
# --- Step 1: Winsorization at 5-95% ---
def winsorize_df(df, cols, lower_q=0.05, upper_q=0.95):
    """Cap the selected columns of ``df`` at their own lower/upper quantiles.

    The quantile bounds are computed from ``df`` itself (previously the
    function ignored its argument and always read the global ``train_all``),
    so it can be reused on any frame.

    Args:
        df: DataFrame to winsorize. It is not modified.
        cols: Iterable of column labels to cap.
        lower_q: Quantile used as the lower bound.
        upper_q: Quantile used as the upper bound.

    Returns:
        A ``(df_w, limits)`` tuple: ``df_w`` is a copy of ``df`` with the
        selected columns clipped, and ``limits`` maps each column to the
        ``(lower, upper)`` bounds that were applied, so the same bounds can
        be reused on another frame (e.g. the test set).
    """
    df_w = df.copy()
    limits = {}
    for col in cols:
        lower = df[col].quantile(lower_q)
        upper = df[col].quantile(upper_q)
        limits[col] = (lower, upper)
        df_w[col] = np.clip(df[col], lower, upper)
    return df_w, limits
# Winsorize the training set and remember the bounds that were used.
train_df_w, limits = winsorize_df(train_all, numeric_cols, 0.05, 0.95)

# Clip the test set with the *training* bounds.
test_df_w = test_all.copy()
for col in limits:
    low, high = limits[col]
    test_df_w[col] = np.clip(test_all[col], low, high)

# --- Step 2: Yeo-Johnson Transformation ---
pt = PowerTransformer(method='yeo-johnson', standardize=False)

# The transformer is fitted on the winsorized training data only ...
train_df_transformed = train_df_w.copy()
train_df_transformed[numeric_cols] = pt.fit_transform(train_df_w[numeric_cols])

# ... and the fitted transformer is then applied to the test data.
test_df_transformed = test_df_w.copy()
test_df_transformed[numeric_cols] = pt.transform(test_df_w[numeric_cols])

# --- Preview results (targets first, then the transformed predictors) ---
preview_cols = target_cols + [c for c in numeric_cols if c not in target_cols]
for title, frame in (
    ("=== Train Transformed Sample ===", train_df_transformed),
    ("\n=== Test Transformed Sample ===", test_df_transformed),
):
    print(title)
    print(frame[preview_cols].head())

# --- Step 3: Combine Transformed Train and Test Data ---
df_transformed = pd.concat((train_df_transformed, test_df_transformed))
print("\n✅ Combined Transformed DataFrame:")
print(df_transformed.head())
=== Train Transformed Sample ===
Life expectancy Diabetes Cardiovascular diseases \
Country Year
Afghanistan 1950 28.1563 6.2 3.97278
1951 28.5836 6.2 3.97278
1952 29.0138 6.2 3.97278
1953 29.4521 6.2 3.97278
1954 29.6975 6.2 3.97278
Cost of a healthy diet Income Inflation \
Country Year
Afghanistan 1950 0.871262 1.069802 2.119237
1951 0.871262 1.069802 2.119237
1952 0.871262 1.069802 2.119237
1953 0.871262 1.069802 2.119237
1954 0.871262 1.069802 2.119237
Child mortality rate Unemployment Rate \
Country Year
Afghanistan 1950 3.288735 1.968604
1951 3.288735 1.968604
1952 3.288735 1.968604
1953 3.288735 1.968604
1954 3.288735 1.968604
Incomplete tertiary education Gini coefficient Sex ratio \
Country Year
Afghanistan 1950 0.26709 0.140308 0.268828
1951 0.26709 0.140308 0.268828
1952 0.26709 0.140308 0.268828
1953 0.26709 0.140308 0.268828
1954 0.26709 0.140308 0.268828
GDP Median age CPI BMI_avg
Country Year
Afghanistan 1950 26.976456 0.704301 9.172623 26.023944
1951 26.976456 0.704280 9.172623 26.023944
1952 26.976456 0.704250 9.172623 26.023944
1953 26.976456 0.704213 9.172623 26.023944
1954 26.976456 0.704172 9.172623 26.023944
=== Test Transformed Sample ===
Life expectancy Diabetes Cardiovascular diseases \
Country Year
Afghanistan 2009 60.2478 10.5 5.004783
2010 60.7018 10.8 5.041143
2011 61.2503 11.1 5.226536
2012 61.7349 11.3 5.342172
2013 62.1878 11.6 5.491725
Cost of a healthy diet Income Inflation \
Country Year
Afghanistan 2009 0.871262 1.069802 0.398066
2010 0.871262 1.069802 1.094684
2011 0.871262 1.069802 2.262320
2012 0.871262 1.069802 1.825930
2013 0.871262 1.069802 1.923973
Child mortality rate Unemployment Rate \
Country Year
Afghanistan 2009 2.277596 2.072607
2010 2.246168 2.072607
2011 2.208257 2.072808
2012 2.173980 2.072104
2013 2.141438 2.073110
Incomplete tertiary education Gini coefficient Sex ratio \
Country Year
Afghanistan 2009 2.632942 0.140308 0.268828
2010 2.675227 0.140308 0.268828
2011 2.675227 0.140308 0.268828
2012 2.675227 0.140308 0.268828
2013 2.675227 0.140308 0.268828
GDP Median age CPI BMI_avg
Country Year
Afghanistan 2009 27.608972 0.700798 10.204804 28.607408
2010 27.772489 0.700798 10.294163 28.748552
2011 27.777676 0.700798 10.766714 28.890923
2012 27.924068 0.700798 11.038471 29.034782
2013 27.990582 0.700955 11.355331 29.180124
✅ Combined Transformed DataFrame:
Cost of a healthy diet Income Inflation \
Country Year
Afghanistan 1950 0.871262 1.069802 2.119237
1951 0.871262 1.069802 2.119237
1952 0.871262 1.069802 2.119237
1953 0.871262 1.069802 2.119237
1954 0.871262 1.069802 2.119237
Child mortality rate Unemployment Rate Life expectancy \
Country Year
Afghanistan 1950 3.288735 1.968604 28.1563
1951 3.288735 1.968604 28.5836
1952 3.288735 1.968604 29.0138
1953 3.288735 1.968604 29.4521
1954 3.288735 1.968604 29.6975
Incomplete tertiary education Gini coefficient Diabetes \
Country Year
Afghanistan 1950 0.26709 0.140308 6.2
1951 0.26709 0.140308 6.2
1952 0.26709 0.140308 6.2
1953 0.26709 0.140308 6.2
1954 0.26709 0.140308 6.2
Cardiovascular diseases Sex ratio GDP Median age \
Country Year
Afghanistan 1950 3.97278 0.268828 26.976456 0.704301
1951 3.97278 0.268828 26.976456 0.704280
1952 3.97278 0.268828 26.976456 0.704250
1953 3.97278 0.268828 26.976456 0.704213
1954 3.97278 0.268828 26.976456 0.704172
CPI BMI_avg
Country Year
Afghanistan 1950 9.172623 26.023944
1951 9.172623 26.023944
1952 9.172623 26.023944
1953 9.172623 26.023944
1954 9.172623 26.023944
# Confirm which index levels the transformed training frame carries.
train_index_levels = train_df_transformed.index.names
print(train_index_levels)
['Country', 'Year']
Lag Feature¶
Lag features are values from previous time steps used as predictors to forecast current or future values.
Lag features are suitable for RQ3, which forecasts life expectancy, diabetes and heart disease over time. They help Prophet and regression models capture dependencies across years more effectively. ARIMA models the lags of the target series internally; Random Forest does not, so it relies on lagged columns such as the ones created here.
Lag is important because it helps to identify patterns and relationships between past and present data points. Time series models, such as ARIMA, heavily rely on lag to capture autocorrelations (the correlation between observations at different time lags) in the data.
Key reasons why lag is essential:
Autocorrelation Detection: Lag enables analysts to understand how current data points are related to previous ones. If there is a significant autocorrelation at a particular lag, it suggests that past values can be used to predict future values. Feature Creation: In machine learning models for time series forecasting, lagged variables are often used as features. These features represent the values of the time series at previous time steps, allowing the model to learn patterns over time. Trend Identification: By observing how values change across different lags, trends and seasonality can be identified. For instance, a consistent increase in lagged values may indicate an upward trend.
In ARIMA, the model forecasts a time series based on the linear relationship between an observation and a number of lagged observations.
Several prior studies show that lag features are a crucial technique for time series modeling with methods such as ARIMA, Prophet and Random Forest.
Debón et al. (2017) used lagged mortality rates to forecast life expectancy in European countries. Wang et al. (2019) – used lagged environmental and health variables to predict life expectancy and disease incidence in China. And Chakraborty et al. (2020) – used lagged economic indicators to predict diabetes trends in India.
# Lag Feature
# === STEP 1: Stack the transformed train and test frames ===
df_transformed = pd.concat((train_df_transformed, test_df_transformed))

# When Country/Year live in the index rather than in columns, move them back
# to columns so the set_index call below always works.
if not {'Country', 'Year'}.issubset(df_transformed.columns):
    df_transformed = df_transformed.reset_index()

# (Country, Year) MultiIndex, sorted, so shifting walks through the years.
df_transformed = df_transformed.set_index(['Country', 'Year']).sort_index()

# === STEP 2: Targets vs. predictors ===
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
predictors = [
    name for name in df_transformed.columns
    if not (name in target_cols or name == 'lagged')
]
# === STEP 3: Create lagged features ===
def create_lag_features(df, cols, lags=(1, 2, 3)):
    """Return a copy of ``df`` with per-country lagged copies of ``cols``.

    Args:
        df: DataFrame whose index has a level named ``'Country'``. Rows are
            shifted in their existing order within each country, so the
            frame should already be sorted by year.
        cols: Column labels to lag.
        lags: Iterable of row offsets. One ``'<col>_lag<k>'`` column is added
            per column and offset. The default is an immutable tuple so it
            cannot be altered between calls.

    Returns:
        A new DataFrame with the original columns plus the lag columns. The
        first ``k`` rows of every country are NaN in a ``_lag<k>`` column.
    """
    df_lag = df.copy()
    # Group once on the untouched input instead of once per (col, lag) pair.
    by_country = df.groupby(level='Country')
    for col in cols:
        for lag in lags:
            df_lag[f'{col}_lag{lag}'] = by_country[col].shift(lag)
    return df_lag
df_lagged = create_lag_features(df_transformed, predictors)

# === STEP 4: Tag lagged vs. unlagged rows ===
df_transformed['lagged'] = False
df_lagged['lagged'] = True

# === STEP 5: Combine both to retain full year coverage ===
# Both frames share the same (Country, Year) keys, so keep='last' retains the
# df_lagged version of every row.
df_combined = pd.concat([df_transformed, df_lagged])
df_combined = df_combined.reset_index()
df_combined = df_combined.drop_duplicates(subset=['Country', 'Year'], keep='last')
df_combined = df_combined.set_index(['Country', 'Year']).sort_index()

# === STEP 6: Impute missing values caused by lagging ===
# Fill forward, then backward, separately inside each country. Grouping on the
# index level avoids DataFrameGroupBy.apply operating on the grouping column,
# which pandas has deprecated.
# NOTE(review): the backward fill copies later years into the first rows of
# each country - confirm this look-ahead is acceptable for forecasting.
df_combined = df_combined.groupby(level='Country').ffill()
df_combined = df_combined.groupby(level='Country').bfill()

# === STEP 7: Final dataset for modeling ===
df_combined_with_country = df_combined.reset_index()

# === STEP 8: Preview sample of lagged features ===
lag_cols = [f'{col}_lag{lag}' for col in predictors for lag in [1, 2, 3]]
print(" Combined Dataset (1950–2023) with Lag Features + Imputed NaNs")
print(df_combined_with_country[['Country', 'Year'] + lag_cols].head(10))

# export and download file
df_combined_with_country.to_csv("df_combined_with_country.csv", index=False)
# NOTE(review): df_lagged is indexed by (Country, Year), so index=False writes
# this file without the identifiers - presumably intentional; confirm.
df_lagged.to_csv("df_lagged.csv", index=False)
from google.colab import files
files.download("df_combined_with_country.csv")
files.download("df_lagged.csv")
/tmp/ipython-input-15-946162427.py:42: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda x: x.sort_values('Year').ffill().bfill())
Combined Dataset (1950–2023) with Lag Features + Imputed NaNs
Country Year Cost of a healthy diet_lag1 \
0 Afghanistan 1950 0.871262
1 Afghanistan 1951 0.871262
2 Afghanistan 1952 0.871262
3 Afghanistan 1953 0.871262
4 Afghanistan 1954 0.871262
5 Afghanistan 1955 0.871262
6 Afghanistan 1956 0.871262
7 Afghanistan 1957 0.871262
8 Afghanistan 1958 0.871262
9 Afghanistan 1959 0.871262
Cost of a healthy diet_lag2 Cost of a healthy diet_lag3 Income_lag1 \
0 0.871262 0.871262 1.069802
1 0.871262 0.871262 1.069802
2 0.871262 0.871262 1.069802
3 0.871262 0.871262 1.069802
4 0.871262 0.871262 1.069802
5 0.871262 0.871262 1.069802
6 0.871262 0.871262 1.069802
7 0.871262 0.871262 1.069802
8 0.871262 0.871262 1.069802
9 0.871262 0.871262 1.069802
Income_lag2 Income_lag3 Inflation_lag1 Inflation_lag2 ... GDP_lag3 \
0 1.069802 1.069802 2.119237 2.119237 ... 26.976456
1 1.069802 1.069802 2.119237 2.119237 ... 26.976456
2 1.069802 1.069802 2.119237 2.119237 ... 26.976456
3 1.069802 1.069802 2.119237 2.119237 ... 26.976456
4 1.069802 1.069802 2.119237 2.119237 ... 26.976456
5 1.069802 1.069802 2.119237 2.119237 ... 26.976456
6 1.069802 1.069802 2.119237 2.119237 ... 26.976456
7 1.069802 1.069802 2.119237 2.119237 ... 26.976456
8 1.069802 1.069802 2.119237 2.119237 ... 26.976456
9 1.069802 1.069802 2.119237 2.119237 ... 26.976456
Median age_lag1 Median age_lag2 Median age_lag3 CPI_lag1 CPI_lag2 \
0 0.704301 0.704301 0.704301 9.172623 9.172623
1 0.704301 0.704301 0.704301 9.172623 9.172623
2 0.704280 0.704301 0.704301 9.172623 9.172623
3 0.704250 0.704280 0.704301 9.172623 9.172623
4 0.704213 0.704250 0.704280 9.172623 9.172623
5 0.704172 0.704213 0.704250 9.172623 9.172623
6 0.704128 0.704172 0.704213 9.172623 9.172623
7 0.704085 0.704128 0.704172 9.172623 9.172623
8 0.704040 0.704085 0.704128 9.172623 9.172623
9 0.703986 0.704040 0.704085 9.172623 9.172623
CPI_lag3 BMI_avg_lag1 BMI_avg_lag2 BMI_avg_lag3
0 9.172623 26.023944 26.023944 26.023944
1 9.172623 26.023944 26.023944 26.023944
2 9.172623 26.023944 26.023944 26.023944
3 9.172623 26.023944 26.023944 26.023944
4 9.172623 26.023944 26.023944 26.023944
5 9.172623 26.023944 26.023944 26.023944
6 9.172623 26.023944 26.023944 26.023944
7 9.172623 26.023944 26.023944 26.023944
8 9.172623 26.023944 26.023944 26.023944
9 9.172623 26.023944 26.023944 26.023944
[10 rows x 38 columns]
# lag feature - REVISED - lag after train test set
import pandas as pd
# === STEP 1: Move Country/Year out of the index when they are the index
panel_keys = ['Country', 'Year']
if list(train_df_transformed.index.names) == panel_keys:
    train_df_transformed = train_df_transformed.reset_index()
if list(test_df_transformed.index.names) == panel_keys:
    test_df_transformed = test_df_transformed.reset_index()

# === STEP 2: (Re)build the sorted (Country, Year) MultiIndex used for lagging
train_df_transformed = train_df_transformed.set_index(['Country', 'Year']).sort_index()
test_df_transformed = test_df_transformed.set_index(['Country', 'Year']).sort_index()

# === STEP 3: Targets vs. predictors
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
predictors = [
    name for name in train_df_transformed.columns
    if not (name in target_cols or name == 'lagged')
]
# === STEP 4: Function to create lag features (only on train set)
def create_lag_features(df, cols, lags=(1, 2, 3)):
    """Return a copy of ``df`` with per-country lagged copies of ``cols``.

    Args:
        df: DataFrame whose index has a level named ``'Country'``. Rows are
            shifted in their existing order within each country, so the
            frame should already be sorted by year.
        cols: Column labels to lag.
        lags: Iterable of row offsets. One ``'<col>_lag<k>'`` column is added
            per column and offset. The default is an immutable tuple so it
            cannot be altered between calls.

    Returns:
        A new DataFrame with the original columns plus the lag columns. The
        first ``k`` rows of every country are NaN in a ``_lag<k>`` column.
    """
    df_lag = df.copy()
    # Group once on the untouched input instead of once per (col, lag) pair.
    by_country = df.groupby(level='Country')
    for col in cols:
        for lag in lags:
            df_lag[f'{col}_lag{lag}'] = by_country[col].shift(lag)
    return df_lag
def _fill_within_country(frame):
    """Forward- then backward-fill NaNs separately inside each country.

    Expects a (Country, Year) MultiIndex. Grouping on the index level avoids
    DataFrameGroupBy.apply operating on the grouping column, which pandas has
    deprecated.
    """
    ordered = frame.sort_index()
    ordered = ordered.groupby(level='Country').ffill()
    return ordered.groupby(level='Country').bfill()


train_lagged = create_lag_features(train_df_transformed, predictors)

# === STEP 5: Impute missing lag values in training set (from early years like 1950–1952)
train_lagged = _fill_within_country(train_lagged)

# === STEP 6: Add a flag for lagged data (optional, useful for debugging)
train_lagged['lagged'] = True
test_df_transformed['lagged'] = False

# === STEP 7: Combine train and test into full set for modeling/prediction
# NOTE(review): the test frame has no *_lag* columns, so after this concat its
# lag cells are NaN and STEP 8 fills them from each country's last training
# row - confirm that this is the intended treatment of the test period.
df_combined = pd.concat([train_lagged, test_df_transformed])
df_combined = df_combined.reset_index()
df_combined = df_combined.sort_values(['Country', 'Year']).drop_duplicates(subset=['Country', 'Year'], keep='last')
df_combined = df_combined.set_index(['Country', 'Year']).sort_index()

# === STEP 8: Final safety fill to ensure no missing cells (optional)
df_combined = _fill_within_country(df_combined)

# === STEP 9: Reset index for modeling
df_combined_with_country = df_combined.reset_index()

# === STEP 10: Check sample of lag features
lag_cols = [f'{col}_lag{lag}' for col in predictors for lag in [1, 2, 3]]
print("\n✅ Final Combined Dataset (1950–2023) with Lag Features — No Missing Values:")
print(df_combined_with_country[['Country', 'Year'] + lag_cols].head(10))

# export and download file
df_combined_with_country.to_csv("df_combined_with_country.csv", index=False)
# NOTE(review): df_combined is indexed by (Country, Year), so index=False
# writes this file without the identifiers - presumably intentional; confirm.
df_combined.to_csv("df_combined.csv", index=False)
from google.colab import files
files.download("df_combined_with_country.csv")
files.download("df_combined.csv")
/tmp/ipython-input-14-469693310.py:35: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda x: x.sort_values('Year').ffill().bfill())
/tmp/ipython-input-14-469693310.py:55: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda x: x.sort_values('Year').ffill().bfill())
✅ Final Combined Dataset (1950–2023) with Lag Features — No Missing Values:
Country Year Cost of a healthy diet_lag1 \
0 Afghanistan 1950 0.871262
1 Afghanistan 1951 0.871262
2 Afghanistan 1952 0.871262
3 Afghanistan 1953 0.871262
4 Afghanistan 1954 0.871262
5 Afghanistan 1955 0.871262
6 Afghanistan 1956 0.871262
7 Afghanistan 1957 0.871262
8 Afghanistan 1958 0.871262
9 Afghanistan 1959 0.871262
Cost of a healthy diet_lag2 Cost of a healthy diet_lag3 Income_lag1 \
0 0.871262 0.871262 1.069802
1 0.871262 0.871262 1.069802
2 0.871262 0.871262 1.069802
3 0.871262 0.871262 1.069802
4 0.871262 0.871262 1.069802
5 0.871262 0.871262 1.069802
6 0.871262 0.871262 1.069802
7 0.871262 0.871262 1.069802
8 0.871262 0.871262 1.069802
9 0.871262 0.871262 1.069802
Income_lag2 Income_lag3 Inflation_lag1 Inflation_lag2 ... GDP_lag3 \
0 1.069802 1.069802 2.119237 2.119237 ... 26.976456
1 1.069802 1.069802 2.119237 2.119237 ... 26.976456
2 1.069802 1.069802 2.119237 2.119237 ... 26.976456
3 1.069802 1.069802 2.119237 2.119237 ... 26.976456
4 1.069802 1.069802 2.119237 2.119237 ... 26.976456
5 1.069802 1.069802 2.119237 2.119237 ... 26.976456
6 1.069802 1.069802 2.119237 2.119237 ... 26.976456
7 1.069802 1.069802 2.119237 2.119237 ... 26.976456
8 1.069802 1.069802 2.119237 2.119237 ... 26.976456
9 1.069802 1.069802 2.119237 2.119237 ... 26.976456
Median age_lag1 Median age_lag2 Median age_lag3 CPI_lag1 CPI_lag2 \
0 0.704301 0.704301 0.704301 9.172623 9.172623
1 0.704301 0.704301 0.704301 9.172623 9.172623
2 0.704280 0.704301 0.704301 9.172623 9.172623
3 0.704250 0.704280 0.704301 9.172623 9.172623
4 0.704213 0.704250 0.704280 9.172623 9.172623
5 0.704172 0.704213 0.704250 9.172623 9.172623
6 0.704128 0.704172 0.704213 9.172623 9.172623
7 0.704085 0.704128 0.704172 9.172623 9.172623
8 0.704040 0.704085 0.704128 9.172623 9.172623
9 0.703986 0.704040 0.704085 9.172623 9.172623
CPI_lag3 BMI_avg_lag1 BMI_avg_lag2 BMI_avg_lag3
0 9.172623 26.023944 26.023944 26.023944
1 9.172623 26.023944 26.023944 26.023944
2 9.172623 26.023944 26.023944 26.023944
3 9.172623 26.023944 26.023944 26.023944
4 9.172623 26.023944 26.023944 26.023944
5 9.172623 26.023944 26.023944 26.023944
6 9.172623 26.023944 26.023944 26.023944
7 9.172623 26.023944 26.023944 26.023944
8 9.172623 26.023944 26.023944 26.023944
9 9.172623 26.023944 26.023944 26.023944
[10 rows x 38 columns]
Restore Index - Country and Year¶
## Make sure df_transformed is indexed by (Country, Year)
# Only set the index when either level is missing from it.
current_levels = df_transformed.index.names
if not ('Country' in current_levels and 'Year' in current_levels):
    df_transformed = df_transformed.set_index(['Country', 'Year'])

# Sorted index keeps later time-aware operations in year order.
df_transformed = df_transformed.sort_index()

# Show the first rows together with their index.
print("✅ Index restored — here’s a sample:")
print(df_transformed.head())
✅ Index restored — here’s a sample:
Cost of a healthy diet Income Inflation \
Country Year
Afghanistan 1950 0.871262 1.069802 2.119237
1951 0.871262 1.069802 2.119237
1952 0.871262 1.069802 2.119237
1953 0.871262 1.069802 2.119237
1954 0.871262 1.069802 2.119237
Child mortality rate Unemployment Rate Life expectancy \
Country Year
Afghanistan 1950 3.288735 1.968604 28.1563
1951 3.288735 1.968604 28.5836
1952 3.288735 1.968604 29.0138
1953 3.288735 1.968604 29.4521
1954 3.288735 1.968604 29.6975
Incomplete tertiary education Gini coefficient Diabetes \
Country Year
Afghanistan 1950 0.26709 0.140308 6.2
1951 0.26709 0.140308 6.2
1952 0.26709 0.140308 6.2
1953 0.26709 0.140308 6.2
1954 0.26709 0.140308 6.2
Cardiovascular diseases Sex ratio GDP Median age \
Country Year
Afghanistan 1950 3.97278 0.268828 26.976456 0.704301
1951 3.97278 0.268828 26.976456 0.704280
1952 3.97278 0.268828 26.976456 0.704250
1953 3.97278 0.268828 26.976456 0.704213
1954 3.97278 0.268828 26.976456 0.704172
CPI BMI_avg lagged
Country Year
Afghanistan 1950 9.172623 26.023944 False
1951 9.172623 26.023944 False
1952 9.172623 26.023944 False
1953 9.172623 26.023944 False
1954 9.172623 26.023944 False
Identify the Best Feature Selection Method and The Best Number of Features for Modeling¶
Comparison of four feature selection methods, using RMSE to find the best number of features for modeling:
- LASSO (Least Absolute Shrinkage and Selection Operator)
- RFE (Recursive Feature Elimination)
- Forward Selection
- Random Forest
Each method selects features based on different principles, and for each method the code evaluates models using a different number of features, starting from 1 and increasing in steps of 2 up to a maximum of 20 (or the number of available features minus one, if that is smaller). For each configuration (method + number of features), the model’s performance is assessed using TimeSeriesSplit cross-validation with Root Mean Squared Error (RMSE) as the evaluation metric. The process is repeated for each target variable separately. The method and feature count with the lowest RMSE is considered optimal for that target.
Feature Selection Comparison (Summary and Charts)¶
Compare feature selection methods and the best number of features using RMSE
# feature selection comparison
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
def find_best_feature_count(X_df, y, max_features=None):
    """Compare four feature-selection methods over a range of feature counts.

    For n = 1, 3, 5, ... up to the feature cap, the top-n features chosen by
    LASSO coefficients, RFE, forward sequential selection and random-forest
    importances are each scored with a plain LinearRegression under a
    3-split TimeSeriesSplit. RMSE is reported on the imputed, unscaled target.

    NOTE(review): the rankings/selectors are fitted on all rows before the
    cross-validated scoring, and the folds follow the row order of ``X_df``;
    confirm that both are acceptable for this panel data.

    Args:
        X_df: Feature DataFrame; missing values are mean-imputed.
        y: Target Series (``.values`` is used); missing values are
            mean-imputed as well.
        max_features: Upper bound on the number of features to try. Falls
            back to 20 when None or 0, and is always capped at the number of
            feature columns minus one.

    Returns:
        ``(df_combined, best_methods)``. ``df_combined`` has the columns
        ``n_features``, ``LASSO_RMSE``, ``RFE_RMSE``, ``Forward_RMSE`` and
        ``RF_RMSE``. ``best_methods`` maps ``'LASSO'``, ``'RFE'``,
        ``'Forward'`` and ``'RandomForest'`` to a dict with the keys
        ``'n_features'``, ``'rmse'`` and ``'features'`` for the lowest-RMSE
        feature count of that method.

    Raises:
        ValueError: If fewer than two feature columns are available, because
            no feature count could then be evaluated.
    """
    import warnings

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import RFE, SequentialFeatureSelector
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import LassoCV, LinearRegression
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import TimeSeriesSplit
    from sklearn.preprocessing import StandardScaler

    feature_names = X_df.columns.tolist()

    # 1. Impute missing values (column means for X, overall mean for y).
    X_imputed = SimpleImputer(strategy='mean').fit_transform(X_df)
    y_imputed = SimpleImputer(strategy='mean').fit_transform(y.values.reshape(-1, 1))

    # --- Scale X and y ---
    X_scaled = StandardScaler().fit_transform(X_imputed)
    y_scaler = StandardScaler()
    y_scaled = y_scaler.fit_transform(y_imputed).ravel()
    y_original = y_imputed.ravel()

    max_features = min(max_features or 20, X_scaled.shape[1] - 1)
    if max_features < 1:
        # Fail early with a clear message instead of on an empty result table.
        raise ValueError(
            "find_best_feature_count needs at least two feature columns"
        )

    tscv = TimeSeriesSplit(n_splits=3)

    def rmse_on_original_scale(model, X_subset):
        """Pool out-of-fold predictions and score them on the unscaled target."""
        y_preds, y_tests = [], []
        for train_idx, test_idx in tscv.split(X_subset):
            model.fit(X_subset[train_idx], y_scaled[train_idx])
            y_pred_scaled = model.predict(X_subset[test_idx])
            y_pred_original = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
            y_preds.extend(y_pred_original)
            y_tests.extend(y_original[test_idx])
        return np.sqrt(mean_squared_error(y_tests, y_preds))

    # --- Feature rankings (ascending, so the best features are at the end) ---
    lasso = LassoCV(cv=tscv, random_state=42).fit(X_scaled, y_scaled)
    lasso_coef = lasso.coef_
    lasso_rank = np.argsort(np.abs(lasso_coef))

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_scaled, y_scaled)
    importances = rf_model.feature_importances_
    rf_rank = np.argsort(importances)

    lasso_rmse_list, rfe_rmse_list, sfs_rmse_list, rf_rmse_list = [], [], [], []
    step = 2
    for n in range(1, max_features + 1, step):
        X_lasso = X_scaled[:, lasso_rank[-n:]]
        lasso_rmse_list.append((n, rmse_on_original_scale(LinearRegression(), X_lasso)))

        # RFE and forward selection stay best-effort: a failure is recorded as
        # NaN for this n, but it is reported instead of silently swallowed.
        try:
            rfe = RFE(LinearRegression(), n_features_to_select=n)
            X_rfe = rfe.fit_transform(X_scaled, y_scaled)
            rfe_rmse = rmse_on_original_scale(LinearRegression(), X_rfe)
        except Exception as exc:
            warnings.warn(f"RFE failed for n_features={n}: {exc}", RuntimeWarning)
            rfe_rmse = np.nan
        rfe_rmse_list.append((n, rfe_rmse))

        try:
            sfs = SequentialFeatureSelector(
                LinearRegression(), n_features_to_select=n,
                direction='forward', cv=tscv, n_jobs=-1,
            )
            X_sfs = sfs.fit_transform(X_scaled, y_scaled)
            sfs_rmse = rmse_on_original_scale(LinearRegression(), X_sfs)
        except Exception as exc:
            warnings.warn(f"Forward selection failed for n_features={n}: {exc}", RuntimeWarning)
            sfs_rmse = np.nan
        sfs_rmse_list.append((n, sfs_rmse))

        X_rf = X_scaled[:, rf_rank[-n:]]
        rf_rmse_list.append((n, rmse_on_original_scale(LinearRegression(), X_rf)))

    df_combined = (
        pd.DataFrame(lasso_rmse_list, columns=['n_features', 'LASSO_RMSE'])
        .merge(pd.DataFrame(rfe_rmse_list, columns=['n_features', 'RFE_RMSE']), on='n_features')
        .merge(pd.DataFrame(sfs_rmse_list, columns=['n_features', 'Forward_RMSE']), on='n_features')
        .merge(pd.DataFrame(rf_rmse_list, columns=['n_features', 'RF_RMSE']), on='n_features')
    )

    def best_count(rmse_column):
        """Feature count with the lowest RMSE in the given column."""
        return int(df_combined.loc[df_combined[rmse_column].idxmin(), 'n_features'])

    def rmse_at(rmse_column, n_features):
        """RMSE recorded for a given feature count."""
        return df_combined.loc[df_combined['n_features'] == n_features, rmse_column].values[0]

    # --- Feature Names ---
    best_lasso_n = best_count('LASSO_RMSE')
    best_rfe_n = best_count('RFE_RMSE')
    best_sfs_n = best_count('Forward_RMSE')
    best_rf_n = best_count('RF_RMSE')

    lasso_features = [feature_names[i] for i in lasso_rank[-best_lasso_n:]]
    rfe = RFE(LinearRegression(), n_features_to_select=best_rfe_n).fit(X_scaled, y_scaled)
    rfe_features = [feature_names[i] for i, flag in enumerate(rfe.support_) if flag]
    sfs = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=best_sfs_n,
        direction='forward', cv=tscv,
    ).fit(X_scaled, y_scaled)
    sfs_features = [feature_names[i] for i, flag in enumerate(sfs.get_support()) if flag]
    rf_features = [feature_names[i] for i in rf_rank[-best_rf_n:]]

    best_methods = {
        'LASSO': {'n_features': best_lasso_n, 'rmse': rmse_at('LASSO_RMSE', best_lasso_n), 'features': lasso_features},
        'RFE': {'n_features': best_rfe_n, 'rmse': rmse_at('RFE_RMSE', best_rfe_n), 'features': rfe_features},
        'Forward': {'n_features': best_sfs_n, 'rmse': rmse_at('Forward_RMSE', best_sfs_n), 'features': sfs_features},
        'RandomForest': {'n_features': best_rf_n, 'rmse': rmse_at('RF_RMSE', best_rf_n), 'features': rf_features},
    }
    return df_combined, best_methods
import matplotlib.pyplot as plt
# Compare the four feature-selection strategies for each health outcome:
# run find_best_feature_count, print the best configuration per method and
# plot RMSE against the number of features kept.
target_cols = ['Cardiovascular diseases', 'Diabetes', 'Life expectancy']
results = {}

# Predictors must not contain any outcome column or any lag built from one.
# Matching on the '<target>_lag' prefix covers every lag order present in
# df_lagged (the original list only named lag1 and lag2) and the lags of the
# other outcomes as well.
lag_prefixes = tuple(f'{name}_lag' for name in target_cols)
lag_cols = [col for col in df_lagged.columns if col.startswith(lag_prefixes)]
cols_to_drop = target_cols + lag_cols

for target in target_cols:
    X = df_lagged.drop(columns=cols_to_drop)
    y = df_lagged[target]
    print(f"\n🔍 Feature selection for target: {target}")
    df_combined, best_methods = find_best_feature_count(X, y)
    results[target] = {'df_combined': df_combined, 'best_methods': best_methods}

    # Text report: best subset size, its RMSE and the chosen feature names.
    for method, info in best_methods.items():
        print(f"\nMethod: {method}")
        print(f"Best number of features: {info['n_features']}")
        print(f"Best RMSE: {info['rmse']:.4f}")
        print(f"Selected features: {info['features']}")

    # One figure per target with one RMSE curve per selection method.
    plt.figure(figsize=(10, 6))
    plt.plot(df_combined['n_features'], df_combined['LASSO_RMSE'], label='LASSO', marker='o')
    plt.plot(df_combined['n_features'], df_combined['RFE_RMSE'], label='RFE', marker='s')
    plt.plot(df_combined['n_features'], df_combined['Forward_RMSE'], label='Forward', marker='^')
    plt.plot(df_combined['n_features'], df_combined['RF_RMSE'], label='Random Forest', marker='v')
    plt.xlabel('Number of Features')
    plt.ylabel('RMSE')
    plt.title(f'RMSE vs Number of Features for Target: {target}')
    plt.grid(True)
    plt.legend()
    plt.show()
🔍 Feature selection for target: Cardiovascular diseases Method: LASSO Best number of features: 3 Best RMSE: 144.6855 Selected features: ['BMI_avg_lag2', 'BMI_avg_lag3', 'GDP'] Method: RFE Best number of features: 3 Best RMSE: 144.5087 Selected features: ['Income', 'GDP', 'BMI_avg'] Method: Forward Best number of features: 11 Best RMSE: 145.4053 Selected features: ['Unemployment Rate', 'Incomplete tertiary education', 'GDP', 'Unemployment Rate_lag1', 'Unemployment Rate_lag2', 'Unemployment Rate_lag3', 'Incomplete tertiary education_lag1', 'Incomplete tertiary education_lag2', 'Incomplete tertiary education_lag3', 'GDP_lag1', 'lagged'] Method: RandomForest Best number of features: 3 Best RMSE: 145.3185 Selected features: ['GDP_lag1', 'GDP', 'Incomplete tertiary education']
🔍 Feature selection for target: Diabetes Method: LASSO Best number of features: 11 Best RMSE: 3.6356 Selected features: ['Incomplete tertiary education_lag3', 'Sex ratio_lag3', 'Incomplete tertiary education', 'Income_lag3', 'CPI', 'Median age_lag3', 'Cost of a healthy diet', 'Income', 'GDP', 'BMI_avg', 'BMI_avg_lag3'] Method: RFE Best number of features: 9 Best RMSE: 3.6424 Selected features: ['Cost of a healthy diet', 'Income', 'Incomplete tertiary education', 'GDP', 'CPI', 'BMI_avg', 'Income_lag3', 'Median age_lag3', 'BMI_avg_lag3'] Method: Forward Best number of features: 17 Best RMSE: 3.6281 Selected features: ['Income', 'Inflation', 'Child mortality rate', 'Incomplete tertiary education', 'Sex ratio', 'GDP', 'BMI_avg', 'Cost of a healthy diet_lag2', 'Income_lag1', 'Income_lag2', 'Income_lag3', 'Inflation_lag3', 'Sex ratio_lag3', 'CPI_lag3', 'BMI_avg_lag1', 'BMI_avg_lag3', 'lagged'] Method: RandomForest Best number of features: 17 Best RMSE: 3.6859 Selected features: ['CPI', 'Median age', 'Unemployment Rate_lag3', 'Inflation', 'Unemployment Rate_lag1', 'Median age_lag3', 'GDP_lag1', 'Cost of a healthy diet', 'Gini coefficient', 'Incomplete tertiary education', 'Unemployment Rate', 'GDP', 'Income', 'BMI_avg_lag2', 'BMI_avg_lag1', 'BMI_avg_lag3', 'BMI_avg']
🔍 Feature selection for target: Life expectancy Method: LASSO Best number of features: 5 Best RMSE: 3.5032 Selected features: ['Median age_lag3', 'Sex ratio', 'GDP', 'Child mortality rate_lag3', 'Child mortality rate'] Method: RFE Best number of features: 7 Best RMSE: 3.5002 Selected features: ['Child mortality rate', 'Sex ratio', 'GDP', 'Median age', 'Child mortality rate_lag2', 'Child mortality rate_lag3', 'Median age_lag3'] Method: Forward Best number of features: 17 Best RMSE: 3.4964 Selected features: ['Child mortality rate', 'Sex ratio', 'GDP', 'Median age', 'Child mortality rate_lag1', 'Child mortality rate_lag2', 'Child mortality rate_lag3', 'Sex ratio_lag1', 'Sex ratio_lag2', 'Sex ratio_lag3', 'GDP_lag1', 'GDP_lag2', 'GDP_lag3', 'Median age_lag2', 'BMI_avg_lag2', 'BMI_avg_lag3', 'lagged'] Method: RandomForest Best number of features: 9 Best RMSE: 3.5058 Selected features: ['BMI_avg', 'GDP', 'Sex ratio', 'Income', 'Median age_lag3', 'Median age', 'Child mortality rate_lag2', 'Child mortality rate_lag3', 'Child mortality rate']
Feature Selection Comparison with R², MAPE, and RMSE (Summary and Charts)¶
Compare the feature selection methods and the best number of features using three metrics: R², MAPE, and RMSE.
# Feature selection with R sq, MAPE, MSE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
# Move plot_metrics function definition to the beginning
def plot_metrics(df_combined, target_name):
    """Draw one line chart per metric comparing the feature-selection methods.

    For each of RMSE, MAPE and R2 a new figure is opened and the column
    ``<method>_<metric>`` of ``df_combined`` is plotted against
    ``df_combined['n_features']`` for LASSO, RFE, Forward and RandomForest.
    A method whose column is absent is reported with a printed warning and
    left out of that figure.

    Args:
        df_combined: DataFrame with an ``n_features`` column plus the
            per-method metric columns.
        target_name: Label inserted into each figure title.
    """
    method_names = ('LASSO', 'RFE', 'Forward', 'RandomForest')
    for metric in ('RMSE', 'MAPE', 'R2'):
        plt.figure(figsize=(10, 6))
        for method in method_names:
            metric_col = f'{method}_{metric}'
            if metric_col not in df_combined.columns:
                print(f"Warning: Metric column '{metric_col}' not found in DataFrame for plotting.")
                continue
            plt.plot(
                df_combined['n_features'],
                df_combined[metric_col],
                label=method,
                marker='o',
            )
        plt.title(f'{metric} vs Number of Features ({target_name})')
        plt.xlabel('Number of Features')
        plt.ylabel(metric)
        plt.legend()
        plt.grid(True)
        plt.show()
def evaluate_model(model, X_subset, y_scaled, y_original, y_scaler, tscv):
    """Cross-validate ``model`` and score its pooled predictions on the original scale.

    The model is refitted on the scaled target for every split produced by
    ``tscv``; fold predictions are mapped back with
    ``y_scaler.inverse_transform`` and pooled across folds before scoring.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_subset: Feature matrix, either a DataFrame or an array. An array is
            wrapped in a DataFrame indexed like ``pd.Series(y_scaled)``.
        y_scaled: Scaled target, indexed positionally.
        y_original: Unscaled target, indexed positionally.
        y_scaler: Fitted scaler used to undo the target scaling.
        tscv: Splitter exposing ``split``.

    Returns:
        Tuple ``(rmse, mape, r2)`` over all finite prediction/actual pairs,
        or three NaNs when no such pair exists.
    """
    if isinstance(X_subset, pd.DataFrame):
        frame = X_subset
    else:
        # Arrays are assumed to be row-aligned with y_scaled.
        frame = pd.DataFrame(X_subset, index=pd.Series(y_scaled).index)

    predicted, actual = [], []
    for train_idx, test_idx in tscv.split(frame):
        X_train = frame.iloc[train_idx]
        X_test = frame.iloc[test_idx]
        fold_actual = y_original[test_idx]
        if len(X_train) == 0 or len(X_test) == 0:
            continue
        try:
            model.fit(X_train, y_scaled[train_idx])
            fold_scaled = model.predict(X_test)
            fold_pred = y_scaler.inverse_transform(fold_scaled.reshape(-1, 1)).ravel()
        except Exception as e:
            print(f"Error during model fitting or prediction in a fold: {e}")
            # NaN placeholders keep both lists aligned; they are filtered out below.
            predicted.extend([np.nan] * len(fold_actual))
            actual.extend(fold_actual)
        else:
            predicted.extend(fold_pred)
            actual.extend(fold_actual)

    actual_arr = np.array(actual)
    predicted_arr = np.array(predicted)
    # Score only the pairs where both the actual and the prediction are finite.
    usable = np.isfinite(actual_arr) & np.isfinite(predicted_arr)
    actual_arr = actual_arr[usable]
    predicted_arr = predicted_arr[usable]

    if len(actual_arr) == 0:
        return np.nan, np.nan, np.nan

    rmse = np.sqrt(mean_squared_error(actual_arr, predicted_arr))
    mape = mean_absolute_percentage_error(actual_arr, predicted_arr)
    r2 = r2_score(actual_arr, predicted_arr)
    return rmse, mape, r2
def find_best_features_with_metrics(X_df, y, max_features=None):
    """Score four feature-ranking strategies for every subset size 1..max_features.

    Features and target are standardised, then for each subset size ``n`` the
    top-``n`` features are chosen by (a) absolute LassoCV coefficient,
    (b) RFE, (c) forward sequential selection and (d) random-forest
    importance.  Each subset is scored by ``evaluate_model`` with a
    ``LinearRegression`` under a 5-split ``TimeSeriesSplit``.

    Args:
        X_df: Predictor DataFrame.
        y: Target Series, row-aligned with ``X_df``.
        max_features: Largest subset size to try. Defaults to 30 and is
            capped at the number of columns in ``X_df``.

    Returns:
        DataFrame with one row per subset size and the columns
        ``n_features``, ``<method>_RMSE``, ``<method>_MAPE`` and
        ``<method>_R2`` for LASSO, RFE, Forward and RandomForest.  An empty
        DataFrame is returned when ``X_df`` has no columns, so callers can
        always test ``.empty`` on the result.
    """
    # Checked before any scaling: the scaler cannot be fitted on zero columns.
    if X_df.shape[1] == 0:
        print("No features available in X_df. Skipping feature selection.")
        return pd.DataFrame()

    # Scale on raw values but rebuild the frame so labels and index survive.
    X_scaled_df = pd.DataFrame(
        StandardScaler().fit_transform(X_df.values),
        columns=X_df.columns,
        index=X_df.index,
    )
    y_column = y.values.reshape(-1, 1)
    y_original = y_column.ravel()
    y_scaler = StandardScaler()
    y_scaled = y_scaler.fit_transform(y_column).ravel()

    tscv = TimeSeriesSplit(n_splits=5)
    max_features = min(max_features or 30, X_scaled_df.shape[1])

    # Rankings that do not depend on n are computed once, ascending by weight.
    lasso = LassoCV(cv=tscv, random_state=42).fit(X_scaled_df, y_scaled)
    lasso_order = np.argsort(np.abs(lasso.coef_))
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_scaled_df, y_scaled)
    rf_order = np.argsort(rf_model.feature_importances_)

    def score(X_subset):
        # Every strategy is judged with the same plain linear model.
        return evaluate_model(LinearRegression(), X_subset, y_scaled, y_original, y_scaler, tscv)

    results = {'LASSO': [], 'RFE': [], 'Forward': [], 'RandomForest': []}
    for n in range(1, max_features + 1):
        # LASSO: the n largest absolute coefficients.
        results['LASSO'].append((n, *score(X_scaled_df.iloc[:, lasso_order[-n:]])))

        # RFE: best-effort, a failure is recorded as NaN metrics for this n.
        try:
            rfe = RFE(LinearRegression(), n_features_to_select=n)
            X_subset_rfe_np = rfe.fit_transform(X_scaled_df, y_scaled)
            results['RFE'].append((n, *score(X_subset_rfe_np)))
        except Exception as e:
            print(f"RFE failed for n={n}: {e}")
            results['RFE'].append((n, np.nan, np.nan, np.nan))

        # Forward selection: best-effort as well, e.g. when the selector
        # rejects the requested number of features.
        try:
            sfs = SequentialFeatureSelector(
                LinearRegression(),
                n_features_to_select=n,
                direction='forward',
                cv=tscv,
                n_jobs=-1,
            )
            X_subset_sfs_np = sfs.fit_transform(X_scaled_df, y_scaled)
            results['Forward'].append((n, *score(X_subset_sfs_np)))
        except Exception as e:
            print(f"Forward Selection failed for n={n}: {e}")
            results['Forward'].append((n, np.nan, np.nan, np.nan))

        # Random forest: the n largest impurity importances.
        results['RandomForest'].append((n, *score(X_scaled_df.iloc[:, rf_order[-n:]])))

    # One frame per method, outer-joined on the subset size.
    method_frames = [
        pd.DataFrame(rows, columns=['n_features', f'{method}_RMSE', f'{method}_MAPE', f'{method}_R2'])
        for method, rows in results.items()
    ]
    df_combined = method_frames[0]
    for method_frame in method_frames[1:]:
        df_combined = df_combined.merge(method_frame, on='n_features', how='outer')
    return df_combined
# Evaluate every feature-selection strategy per target with RMSE, MAPE and R2
# and plot the curves.  Requires df_lagged (the lagged dataset) to exist.
# NOTE(review): this rebinds `results` to {target: metrics DataFrame}; code that
# expects the {'df_combined', 'best_methods'} layout must not run after this
# cell without recomputing it.
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
results = {}
lag_prefixes = tuple(f'{name}_lag' for name in target_cols)

for target in target_cols:
    if 'df_lagged' not in locals() or target not in df_lagged.columns:
        print(f"df_lagged or target column '{target}' not found. Skipping evaluation for this target.")
        continue

    # No outcome column, and no lag of an outcome, may act as a predictor.
    # Dropping only the current target would leave the other two outcomes in X.
    cols_to_drop = [
        col for col in df_lagged.columns
        if col in target_cols or col.startswith(lag_prefixes)
    ]
    X = df_lagged.drop(columns=cols_to_drop)

    # Keep only rows where the target is known and align X to them by index.
    y = df_lagged[target].dropna()
    X = X.loc[y.index]

    if X.empty:
        print(f"No valid data points after dropping NaNs for target: {target}. Skipping evaluation.")
        results[target] = pd.DataFrame()  # placeholder so every target has an entry
        continue

    print(f"\n🔍 Evaluating for target: {target}")
    df_metrics = find_best_features_with_metrics(X, y)
    results[target] = df_metrics

    if not df_metrics.empty:
        plot_metrics(df_metrics, target)
    else:
        print(f"No metrics to plot for target: {target}.")
🔍 Evaluating for target: Life expectancy
🔍 Evaluating for target: Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.3344987559357833, tolerance: 1.2543280301043949 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.3041213861906726, tolerance: 1.2543280301043949 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.2939728821220342, tolerance: 1.2543280301043949 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.291560398247384, tolerance: 1.2543280301043949 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.2833807594306563, tolerance: 1.2543280301043949 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.2738431218404003, tolerance: 1.2543280301043949 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 1.263577498113591, tolerance: 1.2543280301043949 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.703431344112687, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6969510646304116, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6853641250145301, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6726185397310473, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6593221832208656, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 1.6457220310112461, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.631997326414421, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.618292309503886, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6047235924033885, tolerance: 1.5837940758962923 model = cd_fast.enet_coordinate_descent_gram(
🔍 Evaluating for target: Diabetes
Comparative Summary Table - Feature Selection with metrics (RMSE, MAE, and R²)¶
## The best Feature Selection with different metrics TABLE
# Install tabulate if needed
!pip install tabulate
from sklearn.linear_model import Ridge, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer # Import Imputer
# Main function to calculate metrics for a given set of features
def calculate_metrics_for_features(X_df, y, feature_indices):
    """Cross-validate a Ridge model on selected columns and score it on the original scale.

    Missing predictor values are mean-imputed, predictors and target are
    standardised, and a fresh ``Ridge`` is fitted on each of the three
    ``TimeSeriesSplit`` folds.  Fold predictions are mapped back to the
    original target scale and pooled before scoring.

    Args:
        X_df: Predictor DataFrame.
        y: Target Series, row-aligned with ``X_df``.
        feature_indices: Positional column indices of ``X_df`` to use.

    Returns:
        Tuple ``(rmse, mae, r2)`` over the pooled out-of-fold predictions.
    """
    X_filled = pd.DataFrame(
        SimpleImputer(strategy='mean').fit_transform(X_df),
        columns=X_df.columns,
        index=X_df.index,
    )
    X_subset = StandardScaler().fit_transform(X_filled)[:, feature_indices]

    y_column = y.values.reshape(-1, 1)
    y_scaler = StandardScaler().fit(y_column)
    y_scaled = y_scaler.transform(y_column).ravel()
    y_original = y_column.ravel()

    predicted, actual = [], []
    splitter = TimeSeriesSplit(n_splits=3)
    for train_idx, test_idx in splitter.split(X_subset):
        ridge = Ridge()
        ridge.fit(X_subset[train_idx], y_scaled[train_idx])
        fold_scaled = ridge.predict(X_subset[test_idx]).reshape(-1, 1)
        # Undo the target scaling so the metrics are in the target's own units.
        predicted.extend(y_scaler.inverse_transform(fold_scaled).ravel())
        actual.extend(y_original[test_idx])

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    return rmse, mae, r2
# Extract Best Results per Method from the 'results' dictionary
def extract_best_per_method(results_dict, X_data_for_targets, y_data_for_targets):
    """Build a summary table of RMSE, MAE and R² for each target/method pair.

    For every target in ``results_dict`` and every method listed under its
    ``'best_methods'`` entry, the selected features are re-scored with
    ``calculate_metrics_for_features``.

    Args:
        results_dict: Mapping ``target -> {'best_methods': {method: info}}``
            where ``info`` carries ``'n_features'`` and ``'features'``.
            Entries without a ``'best_methods'`` key (for example plain
            DataFrames) are skipped with a message instead of raising.
        X_data_for_targets: Mapping ``target -> predictor DataFrame``.
        y_data_for_targets: Mapping ``target -> target Series``.

    Returns:
        DataFrame with the columns Target, Method, n_features, RMSE, MAE and
        R².  A method recorded with zero features yields a row of NaN
        metrics.  A method whose selected features are all absent from the
        predictor frame yields no row.
    """
    summary = []
    for target, target_results in results_dict.items():
        if not isinstance(target_results, dict) or 'best_methods' not in target_results:
            print(f"Skipping target '{target}': results entry has no 'best_methods'.")
            continue
        if target not in X_data_for_targets or target not in y_data_for_targets:
            print(f"Skipping target '{target}': no prepared X/y data for it.")
            continue

        X_target = X_data_for_targets[target]
        y_target = y_data_for_targets[target]

        for method, info in target_results['best_methods'].items():
            n_features = info['n_features']
            selected_feature_names = info['features']

            # Only features that still exist in X_target can be re-scored.
            valid_selected_features = [col for col in selected_feature_names if col in X_target.columns]
            missing_features = [col for col in selected_feature_names if col not in X_target.columns]
            if missing_features:
                print(
                    f"Warning: features {missing_features} not found in data for "
                    f"target {target}, method {method}; they are ignored."
                )

            if n_features > 0 and valid_selected_features:
                # The frame is already restricted to the chosen columns, so
                # the positional indices simply cover all of them.
                rmse, mae, r2 = calculate_metrics_for_features(
                    X_target[valid_selected_features],
                    y_target,
                    list(range(len(valid_selected_features))),
                )
                summary.append({
                    'Target': target,
                    'Method': method,
                    'n_features': len(valid_selected_features),
                    'RMSE': round(rmse, 2),
                    'MAE': round(mae, 2),
                    'R²': round(r2, 4),
                })
            elif n_features == 0:
                summary.append({
                    'Target': target,
                    'Method': method,
                    'n_features': 0,
                    'RMSE': np.nan,
                    'MAE': np.nan,
                    'R²': np.nan,
                })
    return pd.DataFrame(summary)
# Build the per-target X / y frames, re-score the best feature set of every
# selection method, print the comparison table and export it as CSV.
# Requires df_lagged and a `results` dict in the
# {'df_combined': ..., 'best_methods': ...} layout.
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
X_data_for_targets = {}
y_data_for_targets = {}

if 'df_lagged' in locals():
    for target in target_cols:
        if target not in df_lagged.columns:
            print(f"Target column '{target}' not found in df_lagged. Cannot prepare data for this target.")
            continue
        lag_cols = [f'{target}_lag1', f'{target}_lag2']
        cols_to_drop = [target] + [col for col in lag_cols if col in df_lagged.columns]
        X = df_lagged.drop(columns=cols_to_drop)
        # Keep only rows where the target is known and align X to them by index.
        y = df_lagged[target].dropna()
        X = X.loc[y.index]
        X_data_for_targets[target] = X
        y_data_for_targets[target] = y

if 'results' in locals() and results and X_data_for_targets and y_data_for_targets:
    best_performance_df = extract_best_per_method(results, X_data_for_targets, y_data_for_targets)
    if not best_performance_df.empty:
        print("\nBest Performance per Method\n")
        print(tabulate(best_performance_df, headers='keys', tablefmt='fancy_grid', showindex=False))

        # Export and download only when a table was actually produced; doing
        # it unconditionally raises NameError whenever the summary was skipped.
        best_performance_df.to_csv("best_feature_selection_summary.csv", index=False)
        from google.colab import files
        files.download("best_feature_selection_summary.csv")
    else:
        print("\nNo best performance results to display.")
else:
    print("\n'results' dictionary, X_data_for_targets, or y_data_for_targets not found or is empty. Please run the feature selection cell first and ensure data is prepared correctly.")
Requirement already satisfied: tabulate in /usr/local/lib/python3.11/dist-packages (0.9.0) Best Performance per Method ╒═════════════════════════╤══════════════╤══════════════╤════════╤═══════╤═════════╕ │ Target │ Method │ n_features │ RMSE │ MAE │ R² │ ╞═════════════════════════╪══════════════╪══════════════╪════════╪═══════╪═════════╡ │ Cardiovascular diseases │ LASSO │ 3 │ 144.69 │ 37.6 │ 0.0043 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Cardiovascular diseases │ RFE │ 3 │ 144.51 │ 38.52 │ 0.0067 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Cardiovascular diseases │ Forward │ 11 │ 145.41 │ 37.95 │ -0.0056 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Cardiovascular diseases │ RandomForest │ 3 │ 145.32 │ 37.89 │ -0.0044 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Diabetes │ LASSO │ 11 │ 3.64 │ 2.6 │ 0.4649 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Diabetes │ RFE │ 9 │ 3.64 │ 2.6 │ 0.4629 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Diabetes │ Forward │ 17 │ 3.63 │ 2.58 │ 0.4671 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Diabetes │ RandomForest │ 17 │ 3.69 │ 2.62 │ 0.45 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Life expectancy │ LASSO │ 5 │ 3.5 │ 2.69 │ 0.9133 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Life expectancy │ RFE │ 7 │ 3.5 │ 2.68 │ 0.9134 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Life expectancy │ Forward │ 17 │ 3.5 │ 2.68 │ 0.9136 │ ├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤ │ Life expectancy │ RandomForest │ 9 │ 3.51 │ 2.68 │ 0.9132 │ 
╘═════════════════════════╧══════════════╧══════════════╧════════╧═══════╧═════════╛
Based on the results in the table, the following feature selection method and number of features will be used for each target in this study:
Life Expectancy - Forward Selection - # of features = 17
Cardiovascular Diseases - RFE - # of features = 3
Diabetes - Forward Selection - # of features = 17
The Feature Selection Summary table was used to determine which feature selection method is best for each target.¶
For cardiovascular diseases, RFE was selected as the preferred method due to its slightly superior RMSE and R² scores.
For diabetes and life expectancy, Forward Selection provided the best overall performance.
Feature Selection¶
RFE is used for Cardiovascular disease, and Forward Selection is used for Diabetes and Life Expectancy, according to the results of the Feature Selection Summary Table.
Target Variables: Life Expectancy, Diabetes and Cardiovascular disease
# Forward Selection - Life Expectancy, Diabetes (NOTE: cell marked for removal)
# RFE - Cardiovascular disease
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
import numpy as np
import pandas as pd
# === Clean dataset
# Drop the 'lagged' column when present so it is not offered to the selectors below.
if 'lagged' in df_lagged:
    df_lagged = df_lagged.drop(columns=['lagged'])

# === Target feature limits
# Forward selection: target -> number of features to pick.
forward_targets = {'Life expectancy': 17, 'Diabetes': 17}
# RFE: a single target with its own feature budget.
rfe_target = 'Cardiovascular diseases'
rfe_num_features = 3

# === Exclude target-related columns
# A column is excluded when it is one of the targets or a '<target>_lag...'
# column of any target.
excluded_cols = [
    col for col in df_lagged.columns
    if col in (*forward_targets, rfe_target)
    or col.startswith(tuple(f"{t}_lag" for t in (*forward_targets, rfe_target)))
]
# === Forward Selection Function
def forward_selection(df, target, max_features):
    """Greedily select up to ``max_features`` predictors for ``target`` and print them.

    Columns named in the module-level ``excluded_cols`` are removed from the
    candidate pool, rows containing any missing value are dropped and the
    remaining predictors are standardised.  At each step the candidate whose
    addition gives the lowest mean 5-fold cross-validated RMSE of a
    ``LinearRegression`` joins the selection.

    Args:
        df: DataFrame holding the predictors and the target column.
        target: Name of the target column.
        max_features: Maximum number of features to select. Fewer are
            selected when the candidate pool is smaller.

    Returns:
        None. The selected features and their CV RMSE are printed.
    """
    print(f"\n🎯 Target: {target}")
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    df_temp = df.drop(columns=[col for col in excluded_cols if col in df.columns]).copy()
    df_temp[target] = df[target]
    df_temp = df_temp.dropna()
    X_raw = df_temp.drop(columns=[target])
    y = df_temp[target]
    if X_raw.empty:
        print(f"⚠️ No usable rows or predictor columns for '{target}'.")
        return

    X_scaled = pd.DataFrame(StandardScaler().fit_transform(X_raw), columns=X_raw.columns)

    def cv_rmse(columns):
        # Mean of the per-fold RMSEs of a plain linear model on these columns.
        scores = cross_val_score(LinearRegression(), X_scaled[columns], y,
                                 scoring='neg_mean_squared_error', cv=5)
        return np.mean(np.sqrt(-scores))

    remaining = list(X_scaled.columns)
    selected = []
    # Never ask for more steps than there are candidates; otherwise the
    # search would try to pick a best feature from an empty pool.
    for _ in range(min(max_features, len(remaining))):
        best_feature = min(remaining, key=lambda f: cv_rmse(selected + [f]))
        selected.append(best_feature)
        remaining.remove(best_feature)

    if not selected:
        print(f"⚠️ No features selected for '{target}'.")
        return

    final_rmse = cv_rmse(selected)
    print(f"✅ Selected ({len(selected)}) for {target}: {selected}")
    print(f"📉 Final CV RMSE: {final_rmse:.4f}")
# === RFE Function
def rfe_selection(df, target, num_features):
    """Select ``num_features`` predictors for ``target`` with RFE and print them.

    Columns named in the module-level ``excluded_cols`` are removed, rows
    containing any missing value are dropped and the predictors are
    standardised before recursive feature elimination with a
    ``LinearRegression``.  The chosen subset is then scored by mean 5-fold
    cross-validated RMSE.

    Args:
        df: DataFrame holding the predictors and the target column.
        target: Name of the target column.
        num_features: Number of features RFE should keep.

    Returns:
        None. The selected features and their CV RMSE are printed.
    """
    print(f"\n🫀 Target: {target} (RFE with {num_features} features)")
    droppable = [col for col in excluded_cols if col in df.columns]
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    # Re-attach the target so that dropna removes incomplete rows from X and y together.
    frame = df.drop(columns=droppable).copy()
    frame[target] = df[target]
    frame = frame.dropna()
    y = frame[target]
    predictors = frame.drop(columns=[target])

    X_scaled = pd.DataFrame(
        StandardScaler().fit_transform(predictors),
        columns=predictors.columns,
    )

    estimator = LinearRegression()
    eliminator = RFE(estimator, n_features_to_select=num_features).fit(X_scaled, y)
    selected = X_scaled.columns[eliminator.support_].tolist()

    fold_mse = -cross_val_score(estimator, X_scaled[selected], y,
                                scoring='neg_mean_squared_error', cv=5)
    final_rmse = np.sqrt(fold_mse).mean()
    print(f"🔍 RFE Selected for {target}: {selected}")
    print(f"📉 Final CV RMSE: {final_rmse:.4f}")
# === Run selections
# Forward selection for each configured target, then RFE for its single target.
for target, limit in forward_targets.items():
    forward_selection(df_lagged, target, max_features=limit)
rfe_selection(df_lagged, target=rfe_target, num_features=rfe_num_features)
🎯 Target: Life expectancy ✅ Selected (17) for Life expectancy: ['Child mortality rate', 'Child mortality rate_lag3', 'GDP', 'Sex ratio_lag2', 'Child mortality rate_lag2', 'Sex ratio', 'CPI', 'CPI_lag3', 'CPI_lag1', 'Sex ratio_lag3', 'CPI_lag2', 'Sex ratio_lag1', 'GDP_lag1', 'GDP_lag2', 'GDP_lag3', 'Child mortality rate_lag1', 'Median age_lag3'] 📉 Final CV RMSE: 3.3999 🎯 Target: Diabetes ✅ Selected (17) for Diabetes: ['BMI_avg_lag3', 'Income', 'GDP', 'Median age_lag3', 'CPI_lag3', 'Sex ratio_lag3', 'Cost of a healthy diet', 'Gini coefficient', 'GDP_lag3', 'Median age_lag2', 'Sex ratio', 'Income_lag1', 'Sex ratio_lag1', 'GDP_lag1', 'GDP_lag2', 'Sex ratio_lag2', 'Income_lag2'] 📉 Final CV RMSE: 3.2859 🫀 Target: Cardiovascular diseases (RFE with 3 features) 🔍 RFE Selected for Cardiovascular diseases: ['Income', 'GDP', 'BMI_avg_lag3'] 📉 Final CV RMSE: 116.1274
# Forward Selection - Life Expectancy, Diabetes
# RFE - Cardiovascular disease -
# REVISED - feature selection after train test set
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import RFE
import numpy as np
import pandas as pd
# === Clean dataset ===
# Drop the 'lagged' column when present so it is not offered to the selectors below.
if 'lagged' in df_lagged:
    df_lagged = df_lagged.drop(columns=['lagged'])

# === Target feature limits ===
# Forward selection: target -> number of features to pick.
forward_targets = {'Life expectancy': 17, 'Diabetes': 17}
# RFE: a single target with its own feature budget.
rfe_target = 'Cardiovascular diseases'
rfe_num_features = 3

# === Exclude target-related columns ===
# A column is excluded when it is one of the targets or a '<target>_lag...'
# column of any target.
excluded_cols = [
    col for col in df_lagged.columns
    if col in (*forward_targets, rfe_target)
    or col.startswith(tuple(f"{t}_lag" for t in (*forward_targets, rfe_target)))
]
# === Forward Selection Function with train-test split ===
def forward_selection(df, target, max_features, test_size=0.2):
    """Run greedy forward feature selection for ``target`` and print the results.

    Predictors are all columns of ``df`` except those listed in the
    module-level ``excluded_cols``. Rows with any missing value are dropped,
    the remaining rows are split in their existing order (``shuffle=False``,
    so the last ``test_size`` fraction is held out), and a ``StandardScaler``
    fitted on the training rows only is applied to both parts.

    In each round the candidate whose addition gives the lowest mean 5-fold
    cross-validated RMSE of a ``LinearRegression`` on the training rows is
    added, until ``max_features`` features are chosen or no candidates remain.

    Args:
        df: Frame holding the predictors and the ``target`` column.
        target: Name of the column to predict.
        max_features: Upper bound on the number of features to select.
        test_size: Fraction of rows held out as the test set.

    Returns:
        None. The selected features, the training CV RMSE and the test RMSE
        are printed. If ``target`` is not a column of ``df`` a warning is
        printed and nothing else happens.
    """
    print(f"\n🎯 Target: {target}")
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    # Prepare data: predictors without target-related columns, plus the target,
    # restricted to complete rows.
    X_raw = df.drop(columns=[col for col in excluded_cols if col in df.columns]).copy()
    df_temp = X_raw.copy()
    df_temp[target] = df[target]
    df_temp = df_temp.dropna()
    X_full = df_temp.drop(columns=[target])
    y_full = df_temp[target]

    # Ordered split: the tail of the frame becomes the test set.
    # NOTE(review): if df stacks several countries, the tail is not "the most
    # recent years" — confirm the row ordering matches the intended hold-out.
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y_full, test_size=test_size, shuffle=False
    )

    # Fit the scaler on the training rows only, then apply it to both parts so
    # the test rows do not influence the scaling parameters.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    remaining = list(X_train_scaled.columns)
    selected = []
    for _ in range(min(max_features, len(remaining))):
        rmse_scores = {}
        for f in remaining:
            trial = selected + [f]
            model = LinearRegression()
            # Cross-validation only on training set
            score = cross_val_score(model, X_train_scaled[trial], y_train,
                                    scoring='neg_mean_squared_error', cv=5)
            rmse_scores[f] = np.mean(np.sqrt(-score))
        best_feature = min(rmse_scores, key=rmse_scores.get)
        selected.append(best_feature)
        remaining.remove(best_feature)

    final_model = LinearRegression()
    final_rmse_train = np.mean(np.sqrt(-cross_val_score(
        final_model, X_train_scaled[selected], y_train,
        scoring='neg_mean_squared_error', cv=5)))
    final_model.fit(X_train_scaled[selected], y_train)
    test_preds = final_model.predict(X_test_scaled[selected])
    final_rmse_test = np.sqrt(np.mean((y_test - test_preds) ** 2))

    # Report the number of features actually selected; it is smaller than
    # max_features when fewer candidate columns are available.
    print(f"✅ Selected ({len(selected)}) features for {target}: {selected}")
    print(f"📉 Final CV RMSE (train): {final_rmse_train:.4f}")
    print(f"📊 RMSE on test set: {final_rmse_test:.4f}")
# === RFE Function with train-test split ===
def rfe_selection(df, target, num_features, test_size=0.2):
    """Select ``num_features`` predictors for ``target`` with RFE and print the results.

    Columns listed in the module-level ``excluded_cols`` are removed from the
    predictors, incomplete rows are dropped, and the rows are split in order
    (``shuffle=False``). A ``StandardScaler`` fitted on the training rows is
    applied to both parts. RFE with a ``LinearRegression`` picks the features
    on the training rows; the function then prints the chosen features, the
    mean 5-fold CV RMSE on the training rows and the RMSE on the test rows.
    Returns None; prints a warning and stops if ``target`` is not in ``df``.
    """
    print(f"\n🫀 Target: {target} (RFE with {num_features} features)")

    droppable = [col for col in excluded_cols if col in df.columns]
    predictors = df.drop(columns=droppable).copy()
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    # Attach the target and keep complete rows only.
    frame = predictors.copy()
    frame[target] = df[target]
    frame = frame.dropna()
    features = frame.drop(columns=[target])
    response = frame[target]

    # Ordered split: the last rows form the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        features, response, test_size=test_size, shuffle=False
    )

    # Scaling parameters come from the training rows only.
    scaler = StandardScaler()
    train_values = scaler.fit_transform(X_train)
    test_values = scaler.transform(X_test)
    X_train_scaled = pd.DataFrame(train_values, columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(test_values, columns=X_test.columns, index=X_test.index)

    model = LinearRegression()
    selector = RFE(model, n_features_to_select=num_features)
    selector.fit(X_train_scaled, y_train)
    selected = list(X_train_scaled.columns[selector.support_])

    fold_neg_mse = cross_val_score(model, X_train_scaled[selected], y_train,
                                   scoring='neg_mean_squared_error', cv=5)
    final_rmse_train = np.mean(np.sqrt(-fold_neg_mse))

    model.fit(X_train_scaled[selected], y_train)
    test_errors = y_test - model.predict(X_test_scaled[selected])
    final_rmse_test = np.sqrt(np.mean(test_errors ** 2))

    print(f"🔍 RFE Selected features for {target}: {selected}")
    print(f"📉 Final CV RMSE (train): {final_rmse_train:.4f}")
    print(f"📊 RMSE on test set: {final_rmse_test:.4f}")
# === Run selections ===
# Forward selection for every configured target, then RFE for the RFE target.
for target, limit in forward_targets.items():
    forward_selection(df_lagged, target=target, max_features=limit)
rfe_selection(df_lagged, target=rfe_target, num_features=rfe_num_features)
🎯 Target: Life expectancy ✅ Selected (17) features for Life expectancy: ['Child mortality rate', 'Child mortality rate_lag3', 'GDP', 'Sex ratio_lag1', 'Unemployment Rate_lag1', 'Cost of a healthy diet', 'Child mortality rate_lag2', 'Sex ratio', 'Sex ratio_lag2', 'Unemployment Rate', 'Unemployment Rate_lag3', 'Unemployment Rate_lag2', 'Sex ratio_lag3', 'Cost of a healthy diet_lag1', 'Cost of a healthy diet_lag2', 'Cost of a healthy diet_lag3', 'GDP_lag1'] 📉 Final CV RMSE (train): 3.3434 📊 RMSE on test set: 3.7352 🎯 Target: Diabetes ✅ Selected (17) features for Diabetes: ['BMI_avg_lag3', 'Income', 'GDP_lag3', 'Median age_lag3', 'Sex ratio_lag3', 'Sex ratio', 'Income_lag3', 'Income_lag2', 'Income_lag1', 'Sex ratio_lag2', 'Sex ratio_lag1', 'GDP_lag2', 'GDP_lag1', 'GDP', 'BMI_avg_lag2', 'BMI_avg_lag1', 'BMI_avg'] 📉 Final CV RMSE (train): 3.2852 📊 RMSE on test set: 3.3866 🫀 Target: Cardiovascular diseases (RFE with 3 features) 🔍 RFE Selected features for Cardiovascular diseases: ['Child mortality rate', 'Incomplete tertiary education', 'Child mortality rate_lag3'] 📉 Final CV RMSE (train): 73.6098 📊 RMSE on test set: 266.1863
The features have been selected for target variables as follows:
- Life expectancy: ['Child mortality rate', 'Child mortality rate_lag3', 'GDP', 'Sex ratio_lag1', 'Unemployment Rate_lag1', 'Cost of a healthy diet', 'Child mortality rate_lag2', 'Sex ratio', 'Sex ratio_lag2', 'Unemployment Rate', 'Unemployment Rate_lag3', 'Unemployment Rate_lag2', 'Sex ratio_lag3', 'Cost of a healthy diet_lag1', 'Cost of a healthy diet_lag2', 'Cost of a healthy diet_lag3', 'GDP_lag1']
- Diabetes: ['BMI_avg_lag3', 'Income', 'GDP_lag3', 'Median age_lag3', 'Sex ratio_lag3', 'Sex ratio', 'Income_lag3', 'Income_lag2', 'Income_lag1', 'Sex ratio_lag2', 'Sex ratio_lag1', 'GDP_lag2', 'GDP_lag1', 'GDP', 'BMI_avg_lag2', 'BMI_avg_lag1', 'BMI_avg']
- Cardiovascular diseases: ['Child mortality rate', 'Incomplete tertiary education', 'Child mortality rate_lag3']
Feature Importance¶
Feature importance quantifies how useful or valuable each feature (independent variable) is in predicting the target variable in a model.
Feature importance refers to a technique used to quantify how much each independent variable contributes to predicting the target variable in a machine learning model.
In this project, which analyzes global health and economic indicators to forecast outcomes such as life expectancy, cardiovascular disease rates, and diabetes, feature importance enhances interpretability by identifying which variables are most influential in driving meaningful predictions, which can be valuable for policy recommendations or academic insights. Based on the feature selection process using Forward Selection and RFE (Table 1), distinct sets of predictors were identified for each health outcome:
For life expectancy, 17 key predictors were selected using Forward Selection, with a strong emphasis on child mortality rate and its lagged values (e.g., lag1, lag2, lag3), as well as GDP and its historical trends (lags 1–3). These results highlight the long-term influence of both early-life health indicators and macroeconomic development on longevity. Additionally, several lagged versions of the Consumer Price Index (CPI) and sex ratio were selected, indicating the importance of economic stability and population structure in shaping life expectancy over time.
For diabetes prevalence, the selected predictors (also from Forward Selection) predominantly include lifestyle and economic variables, such as BMI (lagged), income, GDP, and cost of a healthy diet. The presence of lagged features for median age, CPI, and sex ratio suggests that both aging demographics and economic accessibility to health-promoting resources (e.g., food affordability) play a substantial role in diabetes outcomes. This aligns closely with Research Question 1 and 2, emphasizing how modifiable factors — particularly income, BMI, and economic indicators — influence chronic disease prevalence.
For cardiovascular disease, the top three features identified via Recursive Feature Elimination (RFE) were income, GDP, and BMI (lagged). These results reinforce the hypothesis that economic capacity and lifestyle-related health behaviors (e.g., body weight management) are central to cardiovascular risk. Notably, this minimal yet effective feature set highlights that a few strong predictors can explain a significant portion of variation in cardiovascular outcomes.
Collectively, these findings support Research Question 1, identifying socioeconomic and lifestyle variables that most strongly influence disease prevalence. They also inform Research Question 2, by demonstrating that modifiable economic and lifestyle factors (e.g., income, BMI, CPI, diet cost) are central to variations in life expectancy and non-communicable disease rates across countries from 1950 to 2023.
# Feature Importance Table - REVISED
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
# === Setup
# Targets and selection methods; their product forms the table's column MultiIndex.
targets = ['Cardiovascular diseases', 'Diabetes', 'Life expectancy']
methods = ['Forward', 'RFE', 'LASSO', 'Random Forest']

# Candidate features scored for every target (one table row per feature).
all_features = [
    'Child mortality rate',
    'Child mortality rate_lag3',
    'GDP',
    'Sex ratio_lag1',
    'Unemployment Rate_lag1',
    'Cost of a healthy diet',
    'Child mortality rate_lag2',
    'Sex ratio',
    'Sex ratio_lag2',
    'Unemployment Rate',
    'Unemployment Rate_lag3',
    'Unemployment Rate_lag2',
    'Sex ratio_lag3',
    'Cost of a healthy diet_lag1',
    'Cost of a healthy diet_lag2',
    'Cost of a healthy diet_lag3',
    'GDP_lag1',
    'BMI_avg_lag3',
    'Income',
    'GDP_lag3',
    'Median age_lag3',
    'Income_lag3',
    'Income_lag2',
    'Income_lag1',
    'GDP_lag2',
    'BMI_avg_lag2',
    'BMI_avg_lag1',
    'BMI_avg',
    'Incomplete tertiary education',
]

# === Initialize importance table
# Float table of zeros: rows are features, columns are (target, method) pairs.
_importance_columns = pd.MultiIndex.from_product([targets, methods])
multi_method_importance = pd.DataFrame(0.0, index=all_features, columns=_importance_columns)
# === Function: Standardization and prep
def prepare_data(target, features):
    """Build the standardized predictor frame and target series for one target.

    Only the entries of ``features`` that exist as columns of the module-level
    ``df_lagged`` are used. Rows with a missing value in any of those columns
    or in ``target`` are dropped before scaling.

    Args:
        target: Name of the target column in ``df_lagged``.
        features: Candidate feature names.

    Returns:
        Tuple ``(X_scaled, y, valid_features)``: the standardized predictors,
        the target values for the same rows, and the list of feature names
        actually used (in column order).
    """
    valid_features = [f for f in features if f in df_lagged.columns]
    df_temp = df_lagged[valid_features + [target]].dropna()
    X = df_temp[valid_features]
    y = df_temp[target]
    # Keep the original row index so X_scaled and y stay label-aligned
    # (without it the scaled frame gets a fresh 0..n-1 index while y keeps its own).
    X_scaled = pd.DataFrame(
        StandardScaler().fit_transform(X), columns=X.columns, index=X.index
    )
    return X_scaled, y, valid_features
# === FORWARD SELECTION
def run_forward(X, y, valid_features, max_features):
    """Greedy forward selection; flag the chosen features in the importance table.

    Each round adds the candidate with the lowest mean 5-fold CV RMSE of a
    ``LinearRegression`` fitted on the already chosen features plus that
    candidate. Chosen features get the value 1 in the ``(target, 'Forward')``
    column of the module-level ``multi_method_importance``. ``target`` is
    read from module scope, not passed in. Returns None.
    """
    chosen = []
    candidates = valid_features.copy()
    n_rounds = min(max_features, len(candidates))
    for _ in range(n_rounds):
        cv_rmse = {}
        for name in candidates:
            fold_neg_mse = cross_val_score(LinearRegression(), X[chosen + [name]], y,
                                           scoring='neg_mean_squared_error', cv=5)
            cv_rmse[name] = np.mean(np.sqrt(-fold_neg_mse))
        winner = min(cv_rmse, key=cv_rmse.get)
        chosen.append(winner)
        candidates.remove(winner)
    for name in chosen:
        multi_method_importance.loc[name, (target, 'Forward')] = 1
# === RFE
def run_rfe(X, y, valid_features, num_features):
    """Run RFE with a linear model; flag the kept features in the importance table.

    Features kept by the selector get the value 1 in the ``(target, 'RFE')``
    column of the module-level ``multi_method_importance``. ``target`` is
    read from module scope. ``valid_features`` is paired positionally with
    the selector's support mask. Returns None.
    """
    selector = RFE(LinearRegression(), n_features_to_select=num_features).fit(X, y)
    kept = [name for name, is_kept in zip(valid_features, selector.support_) if is_kept]
    for name in kept:
        multi_method_importance.loc[name, (target, 'RFE')] = 1
# === LASSO
def run_lasso(X, y):
    """Fit ``LassoCV`` and store absolute coefficients in the importance table.

    For every column of ``X`` the absolute coefficient, rounded to 4 decimals,
    is written to the ``(target, 'LASSO')`` column of the module-level
    ``multi_method_importance``. ``target`` is read from module scope.
    Returns None.
    """
    lasso = LassoCV(cv=5, random_state=42)
    lasso.fit(X, y)
    for name, coefficient in zip(X.columns, lasso.coef_):
        multi_method_importance.loc[name, (target, 'LASSO')] = round(abs(coefficient), 4)
# === RANDOM FOREST
def run_rf(X, y):
    """Fit a random forest and store its feature importances in the importance table.

    For every column of ``X`` the forest's ``feature_importances_`` value,
    rounded to 4 decimals, is written to the ``(target, 'Random Forest')``
    column of the module-level ``multi_method_importance``. ``target`` is
    read from module scope. Returns None.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X, y)
    for name, importance in zip(X.columns, forest.feature_importances_):
        multi_method_importance.loc[name, (target, 'Random Forest')] = round(importance, 4)
# === Run loop for all targets
# The loop variable must be named `target`: the run_* helpers read it from module scope.
for target in targets:
    if target in df_lagged.columns:
        X_scaled, y, valid = prepare_data(target, all_features)
        # Forward selection keeps 3 features for cardiovascular diseases, 17 otherwise.
        forward_cap = 3 if target == 'Cardiovascular diseases' else 17
        run_forward(X_scaled, y, valid, max_features=forward_cap)
        run_rfe(X_scaled, y, valid, num_features=3)
        run_lasso(X_scaled, y)
        run_rf(X_scaled, y)
    else:
        print(f"⚠️ Skipping {target} — not found in dataset.")
# === Build styled table
# CSS rules: collapsed table borders, plus a thin black border and padding on every cell.
_multi_table_css = [
    {'selector': 'table', 'props': [('border-collapse', 'collapse'),
                                    ('border', '1px solid black')]},
    {'selector': 'th, td', 'props': [('border', '1px solid black'), ('padding', '4px')]},
]
styled_multi_table = (
    multi_method_importance.style
    .set_caption("📊 Multi-Method Feature Importance Table")
    .format(precision=4)
    .set_table_styles(_multi_table_css)
)
# NOTE: the Styler is only constructed here; nothing in this block renders it.
# Summary table: add one 'Combined' column per target holding the sum of the
# four method scores (Forward + RFE + LASSO + Random Forest, missing values as 0).
for target in targets:
    combined = multi_method_importance[(target, 'Forward')].fillna(0)
    for method in ('RFE', 'LASSO', 'Random Forest'):
        combined = combined + multi_method_importance[(target, method)].fillna(0)
    multi_method_importance[(target, 'Combined')] = combined

# === Reorder for display
# Each pass re-sorts the whole table by that target's combined score
# (descending), so the final row order follows the last target in `targets`.
for target in targets:
    multi_method_importance = multi_method_importance.sort_values(
        by=(target, 'Combined'), ascending=False
    )

# === Display enhanced table
_combined_table_css = [
    {'selector': 'table', 'props': [('border-collapse', 'collapse'), ('border', '1px solid black')]},
    {'selector': 'th, td', 'props': [('border', '1px solid black'), ('padding', '5px')]},
]
styled_combined_table = (
    multi_method_importance.style
    .set_caption("⭐ Enhanced Feature Importance Comparison (4 Methods + Combined)")
    .format(precision=4)
    .set_table_styles(_combined_table_css)
)
display(styled_combined_table)

# Save the table (feature names are kept as the index column)
multi_method_importance.to_csv("enhanced_feature_importance_comparison.csv", index=True)

# Trigger a browser download of the saved file from Colab
from google.colab import files
files.download('enhanced_feature_importance_comparison.csv')
| Cardiovascular diseases | Diabetes | Life expectancy | Cardiovascular diseases | Diabetes | Life expectancy | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Forward | RFE | LASSO | Random Forest | Forward | RFE | LASSO | Random Forest | Forward | RFE | LASSO | Random Forest | Combined | Combined | Combined | |
| Child mortality rate | 0.0000 | 0.0000 | 0.0000 | 0.0031 | 0.0000 | 0.0000 | 0.0000 | 0.0083 | 1.0000 | 1.0000 | 18.7165 | 0.9229 | 0.0031 | 0.0083 | 21.6394 |
| Child mortality rate_lag3 | 0.0000 | 0.0000 | 0.0000 | 0.0018 | 0.0000 | 0.0000 | 0.1466 | 0.0060 | 1.0000 | 1.0000 | 7.5788 | 0.0210 | 0.0018 | 0.1526 | 9.5998 |
| Child mortality rate_lag2 | 0.0000 | 0.0000 | 0.0000 | 0.0027 | 0.0000 | 0.0000 | 0.0000 | 0.0054 | 1.0000 | 1.0000 | 0.0000 | 0.0053 | 0.0027 | 0.0054 | 2.0053 |
| Income_lag3 | 0.0000 | 0.0000 | 0.0000 | 0.0666 | 1.0000 | 0.0000 | 0.3318 | 0.0109 | 1.0000 | 0.0000 | 0.3482 | 0.0020 | 0.0666 | 1.3427 | 1.3502 |
| GDP | 0.0000 | 1.0000 | 0.0000 | 0.0878 | 1.0000 | 1.0000 | 0.3975 | 0.0254 | 1.0000 | 0.0000 | 0.3190 | 0.0021 | 1.0878 | 2.4229 | 1.3211 |
| BMI_avg | 1.0000 | 0.0000 | 0.0000 | 0.0062 | 0.0000 | 0.0000 | 0.0000 | 0.0106 | 1.0000 | 0.0000 | 0.1729 | 0.0026 | 1.0062 | 0.0106 | 1.1755 |
| Sex ratio | 0.0000 | 0.0000 | 0.0000 | 0.0076 | 1.0000 | 0.0000 | 0.0923 | 0.0054 | 1.0000 | 0.0000 | 0.1535 | 0.0024 | 0.0076 | 1.0977 | 1.1559 |
| Median age_lag3 | 0.0000 | 0.0000 | 0.0000 | 0.0060 | 1.0000 | 0.0000 | 0.3257 | 0.0281 | 1.0000 | 0.0000 | 0.1433 | 0.0073 | 0.0060 | 1.3538 | 1.1506 |
| Sex ratio_lag3 | 0.0000 | 0.0000 | 0.0000 | 0.0262 | 1.0000 | 0.0000 | 0.1618 | 0.0071 | 1.0000 | 0.0000 | 0.0936 | 0.0023 | 0.0262 | 1.1689 | 1.0959 |
| Sex ratio_lag2 | 0.0000 | 0.0000 | 0.0000 | 0.0108 | 1.0000 | 0.0000 | 0.0150 | 0.0048 | 1.0000 | 0.0000 | 0.0793 | 0.0019 | 0.0108 | 1.0198 | 1.0812 |
| Sex ratio_lag1 | 0.0000 | 0.0000 | 0.0000 | 0.0095 | 1.0000 | 0.0000 | 0.0165 | 0.0045 | 1.0000 | 0.0000 | 0.0246 | 0.0016 | 0.0095 | 1.0210 | 1.0262 |
| BMI_avg_lag3 | 1.0000 | 1.0000 | 0.0000 | 0.0558 | 1.0000 | 1.0000 | 3.3170 | 0.5842 | 1.0000 | 0.0000 | 0.0000 | 0.0018 | 2.0558 | 5.9012 | 1.0018 |
| GDP_lag3 | 0.0000 | 0.0000 | 0.0000 | 0.1160 | 1.0000 | 0.0000 | 0.2146 | 0.0263 | 1.0000 | 0.0000 | 0.0000 | 0.0016 | 0.1160 | 1.2409 | 1.0016 |
| GDP_lag1 | 0.0000 | 0.0000 | 0.0000 | 0.0272 | 1.0000 | 0.0000 | 0.0000 | 0.0150 | 1.0000 | 0.0000 | 0.0000 | 0.0013 | 0.0272 | 1.0150 | 1.0013 |
| BMI_avg_lag1 | 1.0000 | 0.0000 | 0.0000 | 0.0098 | 0.0000 | 0.0000 | 0.0000 | 0.0083 | 1.0000 | 0.0000 | 0.0000 | 0.0013 | 1.0098 | 0.0083 | 1.0013 |
| GDP_lag2 | 0.0000 | 0.0000 | 0.0000 | 0.0206 | 1.0000 | 0.0000 | 0.0000 | 0.0172 | 1.0000 | 0.0000 | 0.0000 | 0.0012 | 0.0206 | 1.0172 | 1.0012 |
| BMI_avg_lag2 | 0.0000 | 0.0000 | 0.0000 | 0.0165 | 0.0000 | 0.0000 | 0.0000 | 0.0103 | 1.0000 | 0.0000 | 0.0000 | 0.0011 | 0.0165 | 0.0103 | 1.0011 |
| Income | 0.0000 | 1.0000 | 0.0000 | 0.0522 | 1.0000 | 1.0000 | 0.5190 | 0.0574 | 0.0000 | 0.0000 | 0.3121 | 0.0025 | 1.0522 | 2.5764 | 0.3146 |
| Unemployment Rate | 0.0000 | 0.0000 | 0.0000 | 0.0105 | 0.0000 | 0.0000 | 0.0432 | 0.0156 | 0.0000 | 0.0000 | 0.0668 | 0.0025 | 0.0105 | 0.0588 | 0.0693 |
| Cost of a healthy diet | 0.0000 | 0.0000 | 0.0000 | 0.0170 | 1.0000 | 0.0000 | 0.3548 | 0.0223 | 0.0000 | 0.0000 | 0.0505 | 0.0013 | 0.0170 | 1.3771 | 0.0518 |
| Incomplete tertiary education | 0.0000 | 0.0000 | 0.0000 | 0.2616 | 0.0000 | 0.0000 | 0.3349 | 0.0377 | 0.0000 | 0.0000 | 0.0203 | 0.0043 | 0.2616 | 0.3726 | 0.0246 |
| Unemployment Rate_lag3 | 0.0000 | 0.0000 | 0.0000 | 0.0230 | 0.0000 | 0.0000 | 0.0348 | 0.0190 | 0.0000 | 0.0000 | 0.0000 | 0.0014 | 0.0230 | 0.0538 | 0.0014 |
| Unemployment Rate_lag1 | 0.0000 | 0.0000 | 0.0000 | 0.0099 | 0.0000 | 0.0000 | 0.0002 | 0.0112 | 0.0000 | 0.0000 | 0.0000 | 0.0013 | 0.0099 | 0.0114 | 0.0013 |
| Cost of a healthy diet_lag3 | 0.0000 | 0.0000 | 0.0000 | 0.0134 | 0.0000 | 0.0000 | 0.0383 | 0.0104 | 0.0000 | 0.0000 | 0.0000 | 0.0013 | 0.0134 | 0.0487 | 0.0013 |
| Income_lag2 | 0.0000 | 0.0000 | 0.0000 | 0.0620 | 1.0000 | 0.0000 | 0.0522 | 0.0116 | 0.0000 | 0.0000 | 0.0000 | 0.0012 | 0.0620 | 1.0638 | 0.0012 |
| Cost of a healthy diet_lag2 | 0.0000 | 0.0000 | 0.0000 | 0.0174 | 1.0000 | 0.0000 | 0.0367 | 0.0054 | 0.0000 | 0.0000 | 0.0000 | 0.0011 | 0.0174 | 1.0421 | 0.0011 |
| Income_lag1 | 0.0000 | 0.0000 | 0.0000 | 0.0371 | 1.0000 | 0.0000 | 0.0774 | 0.0147 | 0.0000 | 0.0000 | 0.0000 | 0.0011 | 0.0371 | 1.0921 | 0.0011 |
| Unemployment Rate_lag2 | 0.0000 | 0.0000 | 0.0000 | 0.0124 | 0.0000 | 0.0000 | 0.0525 | 0.0104 | 0.0000 | 0.0000 | 0.0000 | 0.0011 | 0.0124 | 0.0629 | 0.0011 |
| Cost of a healthy diet_lag1 | 0.0000 | 0.0000 | 0.0000 | 0.0094 | 1.0000 | 0.0000 | 0.0000 | 0.0066 | 0.0000 | 0.0000 | 0.0000 | 0.0011 | 0.0094 | 1.0066 | 0.0011 |
Refer to the above table - The feature importance analysis across multiple selection methods—Forward Selection, Recursive Feature Elimination (RFE), LASSO, and Random Forest—reveals distinct and insightful patterns in how different variables relate to three key health outcomes: life expectancy, diabetes, and cardiovascular diseases (CVD). For life expectancy, the most dominant predictor is child mortality rate, including its lagged versions. This finding underscores a strong inverse relationship between child mortality and life expectancy, highlighting the long-term benefits of improving early childhood health. Other notable features include lagged BMI averages and socioeconomic indicators such as income and sex ratio, suggesting that both historical health trends and broader demographic factors influence longevity.
In the case of diabetes, the most influential feature is BMI_avg_lag3, indicating that higher BMI levels from three years prior are a strong predictor of diabetes prevalence. This reflects the chronic and gradual development of diabetes linked to long-term obesity. Socioeconomic factors like income, the cost of a healthy diet, and lagged income variables also emerge as important predictors, suggesting that financial access to healthy food and lifestyle conditions are significant contributors. Additionally, lagged sex ratio and income highlight the delayed impact of gender distribution and earnings on diabetes rates.
For cardiovascular diseases, BMI_avg_lag3 again stands out as a major factor, pointing to obesity’s long-term role in heart-related conditions. Uniquely, education-related features, such as incomplete tertiary education, show relevance only for CVD, indicating that educational attainment may influence cardiovascular health through awareness, healthcare access, or lifestyle choices. Economic indicators like GDP and income also contribute but to a lesser extent than for diabetes or life expectancy.
Across all targets, lagged variables consistently outperform current-year features, emphasizing the delayed effects of socioeconomic and health conditions on public health outcomes. For instance, lagged income, BMI, and child mortality often provide stronger predictive power than their contemporaneous counterparts. This suggests that interventions in health or economic policy may take several years to manifest in population health metrics, reinforcing the need for long-term planning in public health strategies.
The results also align well with potential research questions. Firstly, key predictors for each outcome were identified, showing that health outcomes are driven by a mix of socio-economic, demographic, and lagged health indicators. Secondly, the importance of lagged features strongly supports the hypothesis that delayed effects exist and can be captured through temporal modeling. Lastly, certain predictors such as BMI_avg_lag3, income, and child mortality prove robust across multiple selection methods, confirming their consistent relevance.
In summary, this feature importance analysis not only highlights the leading drivers of life expectancy, diabetes, and cardiovascular diseases but also reveals the critical role of historical data in shaping current health outcomes. These insights provide valuable guidance for public health planning, suggesting that investments in early-life health, economic accessibility, and education can yield significant long-term benefits across diverse health indicators.
Feature Importance Plot¶
# Horizontal bar chart of the highest combined scores, one figure per target
# === Bar colour for each target
custom_colors = {
    'Cardiovascular diseases': 'mediumseagreen',
    'Diabetes': 'darkorange',
    'Life expectancy': 'cornflowerblue',
}
targets = ['Cardiovascular diseases', 'Diabetes', 'Life expectancy']

# === Number of features shown per chart
top_n = 10  # adjust as needed

for target in targets:
    combined_col = (target, 'Combined')
    # Single-column frame with the combined score, highest first, cut to top_n rows.
    ranking = multi_method_importance[[combined_col]].copy()
    ranking.columns = ['Combined Score']
    df_top = ranking.sort_values(by='Combined Score', ascending=False).head(top_n)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Combined Score', y=df_top.index, data=df_top,
                color=custom_colors[target], ax=ax)
    ax.set_title(f"Top {top_n} Features for {target} (Combined Score)", fontsize=14)
    ax.set_xlabel("Combined Importance")
    ax.set_ylabel("Feature")
    fig.tight_layout()
    plt.show()
Result of Feature Importance Score - duplicate - combined with the previous analysis¶
The combined feature importance analysis reveals distinct patterns for each health-related outcome in your dataset. For cardiovascular diseases, the most influential features appear to be BMI-related metrics, income and its lagged variations, child mortality rate, and inflation. These variables suggest that lifestyle indicators and economic stressors significantly affect cardiovascular health, with recurring trends in lagged financial indicators highlighting their cumulative impact over time.
In the case of diabetes, the leading predictors include BMI averages, income trends, age-related metrics, and sex ratio variables. These results align with clinical understandings of diabetes as a condition deeply tied to aging populations, metabolic health, and economic access to preventive care and treatment. The combined scores spotlight chronic health trends and demographic shifts as primary drivers in diabetes prevalence.
For life expectancy, broader systemic features dominate the importance rankings. Child mortality rate is the most prominent indicator, followed closely by GDP and education metrics such as incomplete tertiary education and its lagged versions. Sex ratio and Gini coefficient also show up as key contributors, reflecting the impact of demographic balance and societal inequality on long-term health outcomes. Altogether, the model highlights that life expectancy is shaped not only by individual well-being, but also by a nation's structural capacity to support its population.
Across all three targets, features such as income, education, and demographic indicators show consistent influence, affirming their central role in determining population health. This integrated approach provides a reliable foundation for future modeling, health policy evaluation, and intervention planning—rooted in both statistical rigor and public health relevance.
Stationary Check - Augmented Dickey-Fuller (ADF)¶
ADF test is used to check whether a time series is stationary, meaning that its statistical properties such as mean and variance do not change over time. Many time series models, especially ARIMA, require stationary input data. The ADF test does this by testing for the presence of a unit root. If the test returns a p-value less than 0.05, it suggests that the data is stationary and does not have a unit root, which is a favorable condition for modeling. If the p-value is higher, it indicates non-stationarity, and you may need to transform the series (e.g., using differencing) before modeling.
In this project, residual diagnostics ensure that the regression assumptions are met, increasing the reliability and interpretability of the models. The ADF test guides your decision on whether time series transformations like differencing are needed before applying models such as ARIMA. Together, these steps strengthen your modeling pipeline by validating model assumptions and ensuring the forecasts are based on appropriate statistical foundations.
# ADF
from statsmodels.tsa.stattools import adfuller
def adf_stationarity_check(series):
    """Run the Augmented Dickey-Fuller test on ``series`` after dropping missing values.

    Returns a dict with the test statistic ("ADF Statistic"), the p-value
    ("p-value") and the critical values ("Critical Values"), taken from
    positions 0, 1 and 4 of the ``adfuller`` result.
    """
    statistic, p_value, _, _, critical_values, *_ = adfuller(series.dropna())
    summary = {}
    summary["ADF Statistic"] = statistic
    summary["p-value"] = p_value
    summary["Critical Values"] = critical_values
    return summary
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

# Predictors: every column except the three targets (all targets are removed,
# not just the one currently being analysed).
X_train = df_lagged.drop(columns=target_cols)

# NOTE(review): df_lagged[target] looks like a stacked Country/Year panel, so the
# ADF test runs on one pooled series — confirm that is intended.
for target in target_cols:
    # Response for the current target
    y_train = df_lagged[target]

    print(f"\n=== Diagnostics for: {target} ===")
    # residual_diagnostics must already be defined when this runs
    diagnostics_result = residual_diagnostics(X_train, y_train)
    print(diagnostics_result)
    adf_result = adf_stationarity_check(y_train)
    print(adf_result)
=== Diagnostics for: Life expectancy ===
({'LM stat': np.float64(1054.960432277746), 'BP p-value': np.float64(1.3671726945811232e-189), 'BP f-value': np.float64(23.371228853758538), 'BP f p-value': np.float64(2.265409568661906e-196)}, np.float64(0.0), Country Year
Afghanistan 1953 -13.643177
1954 -13.466028
1955 -12.853535
1956 -12.403676
1957 -11.915237
...
Zimbabwe 2019 -2.418058
2020 -2.201037
2021 -3.600304
2022 -1.790547
2023 -2.078995
Length: 16928, dtype: float64)
{'ADF Statistic': np.float64(-18.53700721188469), 'p-value': np.float64(2.0998208728125792e-30), 'Critical Values': {'1%': np.float64(-3.4307207835986477), '5%': np.float64(-2.861703872110946), '10%': np.float64(-2.566857224856817)}}
=== Diagnostics for: Cardiovascular diseases ===
({'LM stat': np.float64(423.07071022958837), 'BP p-value': np.float64(1.788087659945201e-61), 'BP f-value': np.float64(9.01373461501685), 'BP f p-value': np.float64(2.1790146707553366e-62)}, np.float64(0.0), Country Year
Afghanistan 1953 -49.297596
1954 -51.587511
1955 -51.100566
1956 -51.349164
1957 -51.703506
...
Zimbabwe 2019 -24.045878
2020 -27.510980
2021 -29.353246
2022 -33.764736
2023 -35.635556
Length: 16928, dtype: float64)
{'ADF Statistic': np.float64(-12.625040454273664), 'p-value': np.float64(1.5482585428290764e-23), 'Critical Values': {'1%': np.float64(-3.4307211202837773), '5%': np.float64(-2.8617040209035434), '10%': np.float64(-2.566857304056856)}}
=== Diagnostics for: Diabetes ===
({'LM stat': np.float64(1517.6712938492537), 'BP p-value': np.float64(1.9324915264761944e-286), 'BP f-value': np.float64(34.63149923847436), 'BP f p-value': np.float64(3.543825588324014e-301)}, np.float64(0.0), Country Year
Afghanistan 1953 1.091656
1954 0.994553
1955 1.014652
1956 0.996681
1957 0.985414
...
Zimbabwe 2019 -2.276379
2020 -2.298225
2021 -1.978299
2022 -1.970616
2023 -1.974694
Length: 16928, dtype: float64)
{'ADF Statistic': np.float64(-12.317202881650106), 'p-value': np.float64(6.878896091647604e-23), 'Critical Values': {'1%': np.float64(-3.4307209097839584), '5%': np.float64(-2.8617039278765235), '10%': np.float64(-2.566857254539986)}}
The Augmented Dickey-Fuller (ADF) test results provide critical insight into the time series characteristics of the three key health outcomes in this study: life expectancy, diabetes, and cardiovascular diseases. All three variables demonstrate strong stationarity, as indicated by highly negative ADF statistics (e.g., -18.54 for life expectancy, -12.63 for cardiovascular diseases, and -12.32 for diabetes) and extremely low p-values (all near zero). These values are well below conventional significance thresholds (0.01 or 0.05), confirming that the time series are stationary—that is, their statistical properties such as mean and variance remain stable over time.
ACF and PACF plot¶
ACF (Autocorrelation Function) and PACF (Partial Autocorrelation Function) plots are visual tools used to analyze the correlation structure of time series data. They help identify patterns and dependencies between data points at different lags (time intervals) and are crucial for determining appropriate models for time series forecasting, particularly AR (Autoregressive) and MA (Moving Average) models.
# ACF and PACF plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# One figure per target: ACF on the top axis, PACF on the bottom axis (40 lags each).
# NOTE(review): df_lagged[target] looks like a stacked Country/Year panel, so lags
# may cross country boundaries — confirm this pooling is intended.
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

for target in target_cols:
    series = df_lagged[target].dropna()

    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 8), sharex=True)
    acf_ax, pacf_ax = ax
    fig.suptitle(f'ACF and PACF for {target}', fontsize=16)

    # Titles are set after plotting so they replace the defaults of the plot helpers.
    plot_acf(series, lags=40, ax=acf_ax)
    acf_ax.set_title(f'Autocorrelation (ACF) - {target}')
    acf_ax.set_ylabel('ACF')

    plot_pacf(series, lags=40, ax=pacf_ax, method='ywm')
    pacf_ax.set_title(f'Partial Autocorrelation (PACF) - {target}')
    pacf_ax.set_ylabel('PACF')

    # Leave room at the top for the figure-level title.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
Based on the ACF and PACF plots, the Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) for all targets show significant spikes that gradually decay over time, which strongly suggests that the series contains autocorrelation — meaning past values have a measurable influence on future ones. This is particularly common in time-series data with memory or lag effects. The ACF’s slow decay pattern indicates that a moving average (MA) component may be present, while the PACF plot with very strong spikes at the first two lags points to a likely autoregressive structure of order two, also known as AR(2). In this case, the series is heavily influenced by its own values from one and two time steps prior. Together, these patterns imply that an ARIMA model would be a suitable fit, specifically one with parameters ARIMA(2, 0, q), where "p = 2" captures the autoregressive lags, "d = 0" reflects the fact that the series is stationary (as shown in the ADF test), and "q" is chosen based on how far the ACF continues to show significant autocorrelation. These insights are instrumental in designing lag-based features or selecting model architectures that are sensitive to temporal dynamics, such as ARIMA, SARIMA, or even recurrent neural networks.
In this forecasting project, three lags (lag1, lag2, and lag3) were chosen for time-dependent predictors based on both statistical diagnostics and inference validation. The Partial Autocorrelation Function (PACF) plots consistently showed strong spikes at lag 1 and lag 2, with a noticeable flattening from lag 3 onward. This suggested an autoregressive structure primarily governed by the first two time steps. However, subsequent HAC-corrected regression analysis revealed that certain lag3 features, including economic and health indicators (e.g., CPI, BMI, inflation, and income), were still statistically significant (p < 0.05), confirming their meaningful contribution despite weaker autocorrelation beyond lag 2. By including lag3 in the modeling framework alongside lag1 and lag2, the models captured short-term memory effects while allowing for delayed impacts that are often present in real-world socioeconomic dynamics. This decision ensures a balance between temporal relevance and statistical validity, strengthening both the explanatory power and forecasting accuracy of the models.
Residual diagnostics (heteroscedasticity, autocorrelation)¶
Residual diagnostics and the ADF (Augmented Dickey-Fuller) test are important tools in time series modeling that help ensure the models are valid, interpretable, and produce reliable forecasts.
Residual diagnostics involve analyzing the residuals, that is, the differences between the actual values and the values predicted by the model. These diagnostics test whether the model assumptions hold, particularly in regression or forecasting models. For example, the Breusch-Pagan test checks for heteroscedasticity, which is when the variance of residuals is not constant over time. Constant variance is a key assumption in linear regression; if violated, it can lead to inefficient estimates and biased standard errors. Similarly, the Ljung-Box test assesses whether residuals are autocorrelated, which means they are correlated across time. If residuals show autocorrelation, the model has likely failed to capture some time-based structure in the data, indicating that it is underfitting or misspecified. Performing these diagnostics ensures that the model is statistically sound and that the insights or forecasts it provides are trustworthy.
# Residual Diagnostics - Test and Summary Table
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import acorr_ljungbox
import pandas as pd
import numpy as np
from tabulate import tabulate
from google.colab import files
# === Function: Residual Diagnostics for one target ===
def residual_diagnostics(X, y, lags=10):
    """Fit an OLS model of ``y`` on ``X`` and test its residuals.

    Rows with a missing value in any predictor or in the target are dropped
    before fitting. An intercept is added with ``sm.add_constant``.

    Args:
        X: DataFrame of predictor columns.
        y: Named Series holding the target; ``y.name`` is used to select the
            target back out of the combined, NaN-free frame.
        lags: Lag order passed to the Ljung-Box test. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(bp_results, lb_pvalue, residuals)`` where ``bp_results``
        maps 'LM stat', 'BP p-value', 'BP f-value' and 'BP f p-value' to the
        Breusch-Pagan outputs, ``lb_pvalue`` is the Ljung-Box p-value at
        ``lags`` (or an explanatory string when there are too few
        observations), and ``residuals`` are the OLS residuals.
    """
    data = pd.concat([X, y], axis=1).dropna()
    X_cleaned = data[X.columns]
    y_cleaned = data[y.name]

    X_const = sm.add_constant(X_cleaned)
    model = sm.OLS(y_cleaned, X_const).fit()
    residuals = model.resid

    # Breusch-Pagan Test
    # The residuals come from the fit on X_const, so the two are already
    # row-aligned. Re-selecting with .loc[residuals.index] is unnecessary and
    # would duplicate rows if the index contained repeated labels.
    bp_test = het_breuschpagan(residuals, X_const)
    bp_labels = ['LM stat', 'BP p-value', 'BP f-value', 'BP f p-value']
    bp_results = dict(zip(bp_labels, bp_test))

    # Ljung-Box Test (needs more observations than the requested lag)
    if len(residuals) > lags:
        lb_test = acorr_ljungbox(residuals, lags=[lags], return_df=True)
        lb_pvalue = lb_test['lb_pvalue'].iloc[0]
    else:
        # Message matches the guard above: it triggers when n <= lags.
        lb_pvalue = f"Insufficient data (n <= {lags})"

    return bp_results, lb_pvalue, residuals
# === Setup
# Outcomes to diagnose; each is regressed on every non-target column of df_lagged.
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
diagnostics_summary = []

if 'df_lagged' not in locals():
    print("❗ df_lagged is not defined. Please run your preprocessing cell first.")
else:
    for target_col in target_cols:
        print(f"\n Running diagnostics for: {target_col}")

        if target_col not in df_lagged.columns:
            print(f" Skipping {target_col} — not found in df_lagged")
            continue

        # Predictors exclude all outcome columns, not just the current one.
        y = df_lagged[target_col]
        X = df_lagged.drop(columns=target_cols, errors='ignore')

        # Pre-check that at least one complete row exists before fitting.
        data = pd.concat([X, y], axis=1).dropna()
        if data.empty:
            print(" Not enough data after dropping NaNs")
            continue

        bp_results, lb_pvalue, residuals = residual_diagnostics(X, y)

        # Build the summary row in display order; numeric values use 4 decimals.
        row = {"Target": target_col}
        row["Breusch-Pagan LM stat"] = round(bp_results['LM stat'], 4)
        for key in ('BP p-value', 'BP f-value', 'BP f p-value'):
            row[key] = round(bp_results[key], 4)
        row["Ljung-Box p-value (lag=10)"] = lb_pvalue
        row["Residual Mean"] = round(residuals.mean(), 4)
        row["Residual Variance"] = round(residuals.var(), 4)
        diagnostics_summary.append(row)

    # === Create summary table
    diagnostics_df = pd.DataFrame(diagnostics_summary)

    # Print as fancy table
    print("\n📋 Residual Diagnostics Summary:")
    print(tabulate(diagnostics_df, headers='keys', tablefmt='fancy_grid', showindex=False))

    # === Export to CSV and trigger the Colab browser download
    filename = "residual_diagnostics_summary.csv"
    diagnostics_df.to_csv(filename, index=False)
    from google.colab import files
    files.download(filename)
Running diagnostics for: Life expectancy Running diagnostics for: Cardiovascular diseases Running diagnostics for: Diabetes 📋 Residual Diagnostics Summary: ╒═════════════════════════╤═════════════════════════╤══════════════╤══════════════╤════════════════╤══════════════════════════════╤═════════════════╤═════════════════════╕ │ Target │ Breusch-Pagan LM stat │ BP p-value │ BP f-value │ BP f p-value │ Ljung-Box p-value (lag=10) │ Residual Mean │ Residual Variance │ ╞═════════════════════════╪═════════════════════════╪══════════════╪══════════════╪════════════════╪══════════════════════════════╪═════════════════╪═════════════════════╡ │ Life expectancy │ 1054.96 │ 0 │ 23.3712 │ 0 │ 0 │ 0 │ 11.3211 │ ├─────────────────────────┼─────────────────────────┼──────────────┼──────────────┼────────────────┼──────────────────────────────┼─────────────────┼─────────────────────┤ │ Cardiovascular diseases │ 423.071 │ 0 │ 9.0137 │ 0 │ 0 │ 0 │ 19607.4 │ ├─────────────────────────┼─────────────────────────┼──────────────┼──────────────┼────────────────┼──────────────────────────────┼─────────────────┼─────────────────────┤ │ Diabetes │ 1517.67 │ 0 │ 34.6315 │ 0 │ 0 │ 0 │ 9.8512 │ ╘═════════════════════════╧═════════════════════════╧══════════════╧══════════════╧════════════════╧══════════════════════════════╧═════════════════╧═════════════════════╛
For life expectancy, the model displays significant signs of both heteroscedasticity and autocorrelation. The Breusch-Pagan test results show extremely low p-values, indicating that the variance of residuals is not constant and may vary depending on specific predictor values. This suggests that the linear model may be missing key nonlinear components or interaction terms that could stabilize prediction behavior. Additionally, the Ljung-Box test (which tests jointly for autocorrelation up to lag 10) reveals strong autocorrelation, meaning past errors are influencing current ones — a sign that temporal patterns are not fully addressed. The mean residual is centered at zero, as is always the case for an OLS fit that includes an intercept, so it does not by itself demonstrate the absence of bias; the residual variance of 11.32 indicates moderate inconsistency in prediction accuracy across observations.
For diabetes, the residual profile reveals similar issues. The Breusch-Pagan test indicates pronounced heteroscedasticity, reinforcing the idea that predictor influence changes across the prediction space, particularly among metabolic or demographic variables like BMI and age. Autocorrelation is again significant according to the Ljung-Box test, implying model limitations in capturing lagged or sequential health dynamics. Although the mean residual is virtually zero — a good sign for bias — the variance of 9.85 suggests moderate prediction error dispersion, warranting further refinement in feature interaction or time-aware modeling.
The cardiovascular diseases model also shows clear heteroscedasticity, as highlighted by the Breusch-Pagan results with very low p-values. Autocorrelation is present, which points to time-based dependencies not fully captured in the linear framework. Most strikingly, the residual variance is extremely high at 19,607.45, hinting at either model instability, data skewness, or presence of outliers that are drastically affecting performance. Despite having a neutral residual mean, the model appears highly sensitive to certain predictors and may benefit from robust regression techniques or transformations to control volatility.
Overall, all three models demonstrate residual patterns that suggest issues with non-constant variance and temporal correlation. These findings recommend considering more flexible approaches such as time-series models, generalized least squares, or regression techniques that accommodate heteroscedasticity and autocorrelation directly. Enhancing each model to better capture nonlinearities or lag structures could meaningfully improve predictive reliability and interpretability.
Heteroscedasticity and Autocorrelation Consistent (HAC)¶
The residual diagnostics indicate that the models' residuals exhibit both heteroscedasticity and autocorrelation, which violate the ordinary least squares (OLS) assumptions of constant variance and independence of the residuals.
To address this problem, we use robust standard errors (Heteroskedasticity- and Autocorrelation-Consistent, or HAC, standard errors) that account for both heteroscedasticity and autocorrelation in the variance-covariance matrix.
HAC corrected standard errors (like from Newey-West estimator) adjust the model's coefficient uncertainty when residuals are non-constant and correlated across time. It doesn't change the point estimates, but it makes the statistical tests more reliable — especially t-values, p-values, and confidence intervals.
# HAC REVISED
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Outcomes to model; each is regressed on every non-target column of df_lagged.
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

# Fit plain OLS per outcome, then recompute the covariance with Newey-West (HAC).
for target in target_cols:
    print(f"\n=== Newey-West Adjusted OLS Results for: {target} ===")
    try:
        # Response, and predictors with every outcome column removed
        y = df_lagged[target]
        all_target_cols_in_df = [col for col in target_cols if col in df_lagged.columns]
        X = df_lagged.drop(columns=all_target_cols_in_df, errors='ignore')

        # Keep only rows complete in both predictors and response
        data = pd.concat([X, y], axis=1).dropna()
        X_cleaned, y_cleaned = data[X.columns], data[y.name]

        # OLS with an intercept
        X_const = sm.add_constant(X_cleaned)
        model = sm.OLS(y_cleaned, X_const).fit()

        # HAC covariance with at most 5 lags (fewer if the sample is tiny);
        # point estimates are unchanged, only the standard errors differ.
        nobs = len(y_cleaned)
        maxlags = min(5, nobs - 1)
        nw_model = model.get_robustcov_results(cov_type='HAC', maxlags=maxlags)
        print(nw_model.summary())

        # Residual summary and line plot in row order
        residuals = nw_model.resid
        print(f"\n📊 Residuals Summary for '{target}':")
        print(pd.Series(residuals).describe())

        if len(residuals) == 0 or np.all(residuals == 0):
            print(f"⚠️ Residuals for '{target}' are empty or flat — no variation to plot.")
        else:
            plt.figure(figsize=(10, 4))
            plt.plot(range(len(residuals)), residuals, color='darkblue', linewidth=1)
            plt.axhline(0, color='gray', linestyle='--')
            plt.title(f"Residuals Over Time — {target}", fontsize=14)
            plt.xlabel("Observation Index")
            plt.ylabel("Residual")
            plt.grid(True)
            plt.tight_layout()
            plt.show()
    except Exception as e:
        print(f"❌ Could not fit HAC model or plot residuals for {target}: {e}")

# export and download file
# NOTE(review): best_performance_df is not produced by this cell; presumably it
# comes from an earlier feature-selection cell — confirm this export belongs here.
best_performance_df.to_csv("best_feature_selection_summary.csv", index=False)
from google.colab import files
files.download("best_feature_selection_summary.csv")
=== Newey-West Adjusted OLS Results for: Life expectancy ===
OLS Regression Results
==============================================================================
Dep. Variable: Life expectancy R-squared: 0.918
Model: OLS Adj. R-squared: 0.917
Method: Least Squares F-statistic: 1.278
Date: Thu, 24 Jul 2025 Prob (F-statistic): 0.276
Time: 19:55:10 Log-Likelihood: -44559.
No. Observations: 16928 AIC: 8.922e+04
Df Residuals: 16879 BIC: 8.959e+04
Df Model: 48
Covariance Type: HAC
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
const -1.387e+08 nan nan nan nan nan
Cost of a healthy diet -5.6642 2.898 -1.955 0.051 -11.344 0.016
Income -1.2168 0.285 -4.270 0.000 -1.775 -0.658
Inflation 0.1587 0.067 2.372 0.018 0.028 0.290
Child mortality rate -26.6933 1.192 -22.396 0.000 -29.029 -24.357
Unemployment Rate -0.2086 0.297 -0.702 0.483 -0.791 0.374
Incomplete tertiary education 0.2633 0.131 2.003 0.045 0.006 0.521
Gini coefficient 12.2883 15.458 0.795 0.427 -18.011 42.588
Sex ratio 2.196e+08 4.07e+09 0.054 0.957 -7.77e+09 8.2e+09
GDP 0.1197 0.081 1.469 0.142 -0.040 0.279
Median age 502.6744 576.528 0.872 0.383 -627.380 1632.729
CPI -0.0719 0.024 -3.044 0.002 -0.118 -0.026
BMI_avg -0.0414 0.039 -1.073 0.283 -0.117 0.034
Cost of a healthy diet_lag1 0.5269 1.822 0.289 0.772 -3.044 4.098
Cost of a healthy diet_lag2 -0.3998 2.777 -0.144 0.886 -5.843 5.043
Cost of a healthy diet_lag3 4.8015 3.009 1.596 0.111 -1.096 10.699
Income_lag1 0.1039 0.201 0.518 0.605 -0.289 0.497
Income_lag2 0.0726 0.244 0.297 0.767 -0.407 0.552
Income_lag3 1.1014 0.325 3.393 0.001 0.465 1.738
Inflation_lag1 0.0803 0.042 1.923 0.054 -0.002 0.162
Inflation_lag2 0.0255 0.042 0.612 0.541 -0.056 0.107
Inflation_lag3 -0.1654 0.077 -2.155 0.031 -0.316 -0.015
Child mortality rate_lag1 0.0505 1.106 0.046 0.964 -2.116 2.217
Child mortality rate_lag2 3.3981 1.241 2.739 0.006 0.966 5.830
Child mortality rate_lag3 10.0025 nan nan nan nan nan
Unemployment Rate_lag1 -0.3134 0.225 -1.396 0.163 -0.754 0.127
Unemployment Rate_lag2 0.5150 0.240 2.142 0.032 0.044 0.986
Unemployment Rate_lag3 -0.1186 0.281 -0.422 0.673 -0.669 0.432
Incomplete tertiary education_lag1 -0.0128 0.125 -0.103 0.918 -0.257 0.231
Incomplete tertiary education_lag2 0.0864 0.159 0.544 0.587 -0.225 0.398
Incomplete tertiary education_lag3 -0.3514 0.200 -1.758 0.079 -0.743 0.040
Gini coefficient_lag1 4.2493 8.833 0.481 0.630 -13.064 21.563
Gini coefficient_lag2 -2.9126 8.570 -0.340 0.734 -19.711 13.886
Gini coefficient_lag3 -18.9249 9.612 -1.969 0.049 -37.765 -0.085
Sex ratio_lag1 4.569e+07 8.87e+09 0.005 0.996 -1.73e+10 1.74e+10
Sex ratio_lag2 1.123e+08 7.4e+09 0.015 0.988 -1.44e+10 1.46e+10
Sex ratio_lag3 1.381e+08 1.97e+09 0.070 0.944 -3.73e+09 4.01e+09
GDP_lag1 0.0178 0.077 0.231 0.817 -0.133 0.169
GDP_lag2 -0.0076 0.025 -0.310 0.757 -0.056 0.041
GDP_lag3 -0.0259 0.051 -0.508 0.611 -0.126 0.074
Median age_lag1 -618.4803 1131.103 -0.547 0.585 -2835.561 1598.600
Median age_lag2 -418.3976 1146.221 -0.365 0.715 -2665.111 1828.316
Median age_lag3 503.8654 599.070 0.841 0.400 -670.375 1678.105
CPI_lag1 -0.0128 0.017 -0.771 0.441 -0.045 0.020
CPI_lag2 0.0047 0.015 0.317 0.751 -0.024 0.034
CPI_lag3 0.0772 0.016 4.783 0.000 0.046 0.109
BMI_avg_lag1 -0.0056 0.026 -0.218 0.828 -0.056 0.045
BMI_avg_lag2 -0.0167 0.028 -0.585 0.558 -0.072 0.039
BMI_avg_lag3 0.0151 0.041 0.365 0.715 -0.066 0.096
==============================================================================
Omnibus: 2958.623 Durbin-Watson: 0.136
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10571.972
Skew: -0.860 Prob(JB): 0.00
Kurtosis: 6.468 Cond. No. 6.74e+11
==============================================================================
Notes:
[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 5 lags and without small sample correction
[2] The smallest eigenvalue is 2.9e-16. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
📊 Residuals Summary for 'Life expectancy':
count 16928.000000
mean 0.000003
std 3.364682
min -32.524090
25% -1.963268
50% 0.267816
75% 2.256538
max 15.502762
dtype: float64
/usr/local/lib/python3.11/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 48, but rank is 4
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1884: RuntimeWarning: invalid value encountered in sqrt
return np.sqrt(np.diag(self.cov_params()))
=== Newey-West Adjusted OLS Results for: Cardiovascular diseases ===
OLS Regression Results
===================================================================================
Dep. Variable: Cardiovascular diseases R-squared: 0.043
Model: OLS Adj. R-squared: 0.040
Method: Least Squares F-statistic: 0.01330
Date: Thu, 24 Jul 2025 Prob (F-statistic): 1.00
Time: 19:55:10 Log-Likelihood: -1.0767e+05
No. Observations: 16928 AIC: 2.154e+05
Df Residuals: 16879 BIC: 2.158e+05
Df Model: 48
Covariance Type: HAC
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
const -4.323e+09 2.21e+09 -1.955 0.051 -8.66e+09 1.02e+07
Cost of a healthy diet 12.5572 32.502 0.386 0.699 -51.151 76.265
Income 21.3556 17.251 1.238 0.216 -12.457 55.169
Inflation 1.1911 2.883 0.413 0.679 -4.460 6.842
Child mortality rate -1.6502 55.220 -0.030 0.976 -109.888 106.588
Unemployment Rate 16.5400 9.638 1.716 0.086 -2.352 35.432
Incomplete tertiary education -5.2535 7.608 -0.691 0.490 -20.166 9.659
Gini coefficient 160.8072 595.017 0.270 0.787 -1005.488 1327.102
Sex ratio 7.411e+09 1.32e+11 0.056 0.955 -2.51e+11 2.66e+11
GDP 3.3278 2.869 1.160 0.246 -2.295 8.950
Median age 1097.7356 1.82e+04 0.060 0.952 -3.45e+04 3.67e+04
CPI 0.4940 1.274 0.388 0.698 -2.002 2.990
BMI_avg -1.7352 1.782 -0.974 0.330 -5.228 1.757
Cost of a healthy diet_lag1 9.9738 nan nan nan nan nan
Cost of a healthy diet_lag2 11.7875 68.555 0.172 0.863 -122.587 146.162
Cost of a healthy diet_lag3 1.1933 37.275 0.032 0.974 -71.869 74.256
Income_lag1 0.4962 8.482 0.058 0.953 -16.130 17.123
Income_lag2 13.8650 19.042 0.728 0.467 -23.459 51.189
Income_lag3 -12.2323 13.529 -0.904 0.366 -38.751 14.287
Inflation_lag1 -0.4568 1.486 -0.307 0.759 -3.370 2.457
Inflation_lag2 -0.6003 1.574 -0.381 0.703 -3.686 2.485
Inflation_lag3 -1.7078 3.475 -0.491 0.623 -8.519 5.103
Child mortality rate_lag1 1.6709 nan nan nan nan nan
Child mortality rate_lag2 2.8105 nan nan nan nan nan
Child mortality rate_lag3 8.4775 17.638 0.481 0.631 -26.096 43.051
Unemployment Rate_lag1 -0.7751 2.350 -0.330 0.742 -5.382 3.832
Unemployment Rate_lag2 0.5853 0.323 1.811 0.070 -0.048 1.219
Unemployment Rate_lag3 -5.4142 5.114 -1.059 0.290 -15.438 4.609
Incomplete tertiary education_lag1 -0.0505 4.402 -0.011 0.991 -8.679 8.578
Incomplete tertiary education_lag2 0.4994 1.740 0.287 0.774 -2.911 3.909
Incomplete tertiary education_lag3 -2.5220 7.285 -0.346 0.729 -16.801 11.757
Gini coefficient_lag1 50.2384 134.988 0.372 0.710 -214.351 314.828
Gini coefficient_lag2 197.9054 214.176 0.924 0.355 -221.902 617.712
Gini coefficient_lag3 -761.0729 290.835 -2.617 0.009 -1331.140 -191.006
Sex ratio_lag1 1.369e+09 nan nan nan nan nan
Sex ratio_lag2 1.918e+09 nan nan nan nan nan
Sex ratio_lag3 5.383e+09 nan nan nan nan nan
GDP_lag1 0.2412 nan nan nan nan nan
GDP_lag2 -0.0958 nan nan nan nan nan
GDP_lag3 1.1461 2.557 0.448 0.654 -3.867 6.159
Median age_lag1 -1056.8683 3.13e+04 -0.034 0.973 -6.24e+04 6.03e+04
Median age_lag2 5836.9081 nan nan nan nan nan
Median age_lag3 -3728.3101 nan nan nan nan nan
CPI_lag1 0.0970 0.685 0.142 0.887 -1.247 1.441
CPI_lag2 0.1355 0.706 0.192 0.848 -1.248 1.519
CPI_lag3 1.4261 1.293 1.103 0.270 -1.108 3.961
BMI_avg_lag1 -0.1263 0.903 -0.140 0.889 -1.896 1.643
BMI_avg_lag2 -0.0982 0.921 -0.107 0.915 -1.904 1.708
BMI_avg_lag3 -3.5173 1.732 -2.031 0.042 -6.912 -0.123
==============================================================================
Omnibus: 26274.800 Durbin-Watson: 0.029
Prob(Omnibus): 0.000 Jarque-Bera (JB): 9869872.713
Skew: 10.133 Prob(JB): 0.00
Kurtosis: 119.544 Cond. No. 6.74e+11
==============================================================================
Notes:
[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 5 lags and without small sample correction
[2] The smallest eigenvalue is 2.9e-16. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
📊 Residuals Summary for 'Cardiovascular diseases':
count 16928.000000
mean 0.000031
std 140.026594
min -114.217116
25% -36.951825
50% -17.545906
75% 2.043523
max 1848.846904
dtype: float64
/usr/local/lib/python3.11/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 48, but rank is 4
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1884: RuntimeWarning: invalid value encountered in sqrt
return np.sqrt(np.diag(self.cov_params()))
=== Newey-West Adjusted OLS Results for: Diabetes ===
OLS Regression Results
==============================================================================
Dep. Variable: Diabetes R-squared: 0.539
Model: OLS Adj. R-squared: 0.537
Method: Least Squares F-statistic: 2.167
Date: Thu, 24 Jul 2025 Prob (F-statistic): 0.0700
Time: 19:55:11 Log-Likelihood: -43381.
No. Observations: 16928 AIC: 8.686e+04
Df Residuals: 16879 BIC: 8.724e+04
Df Model: 48
Covariance Type: HAC
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
const -1.294e+08 nan nan nan nan nan
Cost of a healthy diet 6.7514 1.664 4.058 0.000 3.490 10.013
Income -1.0163 0.277 -3.675 0.000 -1.558 -0.474
Inflation -0.1576 0.059 -2.650 0.008 -0.274 -0.041
Child mortality rate -0.1721 0.736 -0.234 0.815 -1.616 1.271
Unemployment Rate 0.0279 0.183 0.152 0.879 -0.331 0.387
Incomplete tertiary education -0.2060 0.048 -4.335 0.000 -0.299 -0.113
Gini coefficient -11.5868 nan nan nan nan nan
Sex ratio 1.678e+08 3.41e+09 0.049 0.961 -6.52e+09 6.85e+09
GDP -0.1475 0.030 -4.868 0.000 -0.207 -0.088
Median age 503.8324 343.204 1.468 0.142 -168.883 1176.548
CPI 0.0779 0.022 3.592 0.000 0.035 0.120
BMI_avg -0.0456 0.057 -0.799 0.424 -0.157 0.066
Cost of a healthy diet_lag1 0.0382 0.929 0.041 0.967 -1.783 1.860
Cost of a healthy diet_lag2 0.8687 1.825 0.476 0.634 -2.708 4.446
Cost of a healthy diet_lag3 0.7153 2.080 0.344 0.731 -3.362 4.792
Income_lag1 -0.2537 0.185 -1.369 0.171 -0.617 0.110
Income_lag2 -0.0917 0.262 -0.350 0.726 -0.606 0.422
Income_lag3 -0.5161 0.281 -1.835 0.067 -1.067 0.035
Inflation_lag1 -0.0909 0.038 -2.403 0.016 -0.165 -0.017
Inflation_lag2 -0.0971 0.044 -2.215 0.027 -0.183 -0.011
Inflation_lag3 -0.1121 0.049 -2.288 0.022 -0.208 -0.016
Child mortality rate_lag1 -0.3867 0.523 -0.739 0.460 -1.412 0.639
Child mortality rate_lag2 0.0770 0.278 0.277 0.782 -0.469 0.623
Child mortality rate_lag3 0.6715 0.251 2.675 0.007 0.179 1.164
Unemployment Rate_lag1 -0.0285 0.107 -0.268 0.789 -0.237 0.180
Unemployment Rate_lag2 -0.0775 0.092 -0.845 0.398 -0.257 0.102
Unemployment Rate_lag3 -0.1296 0.188 -0.689 0.491 -0.498 0.239
Incomplete tertiary education_lag1 0.0421 0.092 0.457 0.648 -0.138 0.223
Incomplete tertiary education_lag2 -0.0203 0.120 -0.169 0.866 -0.256 0.216
Incomplete tertiary education_lag3 -0.1239 0.153 -0.811 0.417 -0.423 0.176
Gini coefficient_lag1 -7.6403 7.914 -0.965 0.334 -23.153 7.873
Gini coefficient_lag2 -1.8097 5.717 -0.317 0.752 -13.015 9.395
Gini coefficient_lag3 1.5183 8.841 0.172 0.864 -15.810 18.847
Sex ratio_lag1 1.984e+07 5.37e+09 0.004 0.997 -1.05e+10 1.05e+10
Sex ratio_lag2 4.239e+07 nan nan nan nan nan
Sex ratio_lag3 2.511e+08 nan nan nan nan nan
GDP_lag1 -0.0006 0.052 -0.012 0.990 -0.103 0.102
GDP_lag2 -0.0003 nan nan nan nan nan
GDP_lag3 -0.0483 0.041 -1.165 0.244 -0.130 0.033
Median age_lag1 -718.9570 695.114 -1.034 0.301 -2081.454 643.539
Median age_lag2 1089.5169 611.331 1.782 0.075 -108.757 2287.790
Median age_lag3 -996.5964 382.339 -2.607 0.009 -1746.020 -247.173
CPI_lag1 -0.0030 0.013 -0.234 0.815 -0.028 0.022
CPI_lag2 -0.0011 0.014 -0.080 0.937 -0.028 0.026
CPI_lag3 0.0204 0.023 0.882 0.378 -0.025 0.066
BMI_avg_lag1 -0.0006 0.031 -0.021 0.984 -0.060 0.059
BMI_avg_lag2 -0.0014 0.034 -0.042 0.967 -0.068 0.065
BMI_avg_lag3 0.9702 0.056 17.417 0.000 0.861 1.079
==============================================================================
Omnibus: 4120.296 Durbin-Watson: 0.108
Prob(Omnibus): 0.000 Jarque-Bera (JB): 12682.365
Skew: 1.250 Prob(JB): 0.00
Kurtosis: 6.425 Cond. No. 6.74e+11
==============================================================================
Notes:
[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 5 lags and without small sample correction
[2] The smallest eigenvalue is 2.9e-16. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
📊 Residuals Summary for 'Diabetes':
count 16928.000000
mean 0.000001
std 3.138661
min -9.906907
25% -2.044869
50% -0.612431
75% 1.408433
max 24.396272
dtype: float64
/usr/local/lib/python3.11/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 48, but rank is 4
warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1884: RuntimeWarning: invalid value encountered in sqrt
return np.sqrt(np.diag(self.cov_params()))
Result of HAC:
The OLS regression summary reveals varying levels of model performance across the three health outcomes. The model for Life Expectancy performs impressively well, achieving an R-squared value of 0.918 and an adjusted R-squared of 0.917. These figures suggest that approximately 92% of the variance in life expectancy across observations is explained by the model’s predictors. Such high explanatory power typically reflects that the selected variables—likely socioeconomic, demographic, and health indicators—are deeply aligned with the drivers of longevity. However, despite the strong fit, the F-statistic is relatively low (1.278) and its p-value (0.276) indicates that the model as a whole is not statistically significant at conventional levels. This apparent contradiction is most likely a numerical artifact rather than a substantive finding: the warnings printed with the output report that the robust covariance of the 48 constraints has rank 4, that the smallest eigenvalue is about 2.9e-16, and several standard errors are reported as NaN. The predictors are therefore severely multicollinear (the design matrix is numerically singular), which makes the HAC-based overall F-test unreliable even though the R-squared is high; individual coefficients and their p-values should likewise be interpreted with caution.
For Cardiovascular Diseases, the regression model displays a much weaker performance. The R-squared is only 0.043, suggesting that the predictors explain just 4.3% of the variation in cardiovascular disease prevalence. The adjusted R-squared is nearly identical at 0.040, further confirming the low explanatory power. The F-statistic is close to zero (0.0133) and the p-value is 1.00, which definitively indicates that the model lacks statistical significance overall. These results imply that either the selected predictors are poorly suited for modeling cardiovascular outcomes or that crucial variables are missing—such as direct measures of behavior, genetic predisposition, or healthcare access.
The Diabetes model performs moderately well. An R-squared of 0.539 and adjusted R-squared of 0.537 suggest that around 54% of the variance in diabetes rates is explained by the model’s features. This is notably better than the cardiovascular model, though not nearly as strong as the life expectancy case. The F-statistic of 2.167 implies some model-wide explanatory power, and the p-value (0.070) teeters just above conventional thresholds for significance. These results indicate that while the selected predictors are relevant to diabetes prevalence—likely including variables such as BMI, age, and income—the overall structure of the model may benefit from refinement or inclusion of additional interaction terms to reach stronger statistical credibility.
To address the strong autocorrelation and heteroscedasticity, we refine the feature selection based on the HAC-corrected results, identifying and retaining only the statistically significant predictors (based on HAC p-values) for downstream regression models such as Random Forest (RF).
# Run HAC on each target
import statsmodels.api as sm
import pandas as pd
import numpy as np
from google.colab import files
# === HAC inference function for one target
def run_hac_inference(X, y, target_name, max_lag=10):
    """Fit OLS with HAC standard errors and tabulate the coefficients.

    Args:
        X: Predictors; copied into a fresh DataFrame before use.
        y: Target as a DataFrame (its first column is used) or anything
            ``pd.Series`` accepts.
        target_name: Label written into the 'Target' column of the result.
        max_lag: ``maxlags`` passed to the HAC covariance estimator.

    Returns:
        DataFrame with columns 'Target', 'Feature', 'Coefficient' and
        'p-value (HAC)', one row per fitted term except the intercept.
    """
    predictors = pd.DataFrame(X).copy()

    # Reduce the response to one dimension
    response = y.iloc[:, 0] if isinstance(y, pd.DataFrame) else pd.Series(y).squeeze()

    # Intercept + OLS fit with a HAC covariance
    design = sm.add_constant(predictors)
    fitted = sm.OLS(response, design).fit(cov_type='HAC', cov_kwds={'maxlags': max_lag})

    summary_df = pd.DataFrame({
        'Target': target_name,
        'Feature': fitted.params.index,
        'Coefficient': fitted.params.values,
        'p-value (HAC)': fitted.pvalues.values,
    })

    # The intercept row is not a candidate feature
    not_intercept = summary_df['Feature'] != 'const'
    return summary_df[not_intercept]
# === Setup
# Outcomes to analyse; each is regressed on every non-target column of df_lagged.
target_cols = ['Life expectancy', 'Diabetes', 'Cardiovascular diseases']
hac_results = []          # one coefficient table per successfully fitted target
stable_feature_dict = {}  # target -> features with HAC p-value < 0.05

# === Run HAC on each target
if 'df_lagged' in locals():
    for target in target_cols:
        print(f"\n📊 Processing HAC inference for: {target}")

        if target not in df_lagged.columns:
            print(f"⚠️ Skipping {target} — column not found")
            continue

        # Exclude all target columns — including current one — from predictors
        X = df_lagged.drop(columns=target_cols, errors='ignore')
        y = df_lagged[target]

        # Drop rows with NaNs
        data = pd.concat([X, y], axis=1).dropna()
        if data.empty:
            print(f"⛔ No clean data for {target}")
            continue

        X_clean = data[X.columns]
        y_clean = data[[target]]  # Keep as DataFrame for safe slicing

        # Run HAC inference; a failure for one target must not stop the others
        try:
            summary_df = run_hac_inference(X_clean, y_clean, target)
        except Exception as e:
            print(f"❌ HAC failed for {target}: {e}")
            continue

        hac_results.append(summary_df)

        # Extract significant features; 'None' is a placeholder so the
        # exported table still shows the target when nothing is significant.
        stable_features = summary_df[summary_df['p-value (HAC)'] < 0.05]['Feature'].tolist()
        stable_feature_dict[target] = stable_features if stable_features else ['None']

    # === Combine all results
    if hac_results:
        hac_summary_df = pd.concat(hac_results).reset_index(drop=True)
        display(hac_summary_df)

        # Export full HAC summary
        hac_summary_df.to_csv("hac_inference_summary.csv", index=False)
        files.download("hac_inference_summary.csv")

        # Export stable features per target
        stable_df = pd.DataFrame([
            {'Target': tgt, 'Stable Features (p < 0.05)': ', '.join(feats)}
            for tgt, feats in stable_feature_dict.items()
        ])
        display(stable_df)
        stable_df.to_csv("hac_stable_features_per_target.csv", index=False)
        files.download("hac_stable_features_per_target.csv")

        # Backup export of the same summary. Kept inside this branch so it
        # only runs when this cell actually produced hac_summary_df; run
        # unconditionally it raised NameError (or exported a stale frame from
        # an earlier run) whenever df_lagged was missing or every fit failed.
        hac_summary_df.to_csv("hac_summary_df.csv", index=False)
        files.download("hac_summary_df.csv")
    else:
        print("⚠️ No HAC results to export.")
else:
    print("❗ 'df_lagged' not found. Please ensure your dataset is loaded.")
📊 Processing HAC inference for: Life expectancy 📊 Processing HAC inference for: Diabetes 📊 Processing HAC inference for: Cardiovascular diseases
| Target | Feature | Coefficient | p-value (HAC) | |
|---|---|---|---|---|
| 0 | Life expectancy | Cost of a healthy diet | -5.664228 | 1.070727e-02 |
| 1 | Life expectancy | Income | -1.216788 | 3.570121e-05 |
| 2 | Life expectancy | Inflation | 0.158719 | 5.512448e-03 |
| 3 | Life expectancy | Child mortality rate | -26.693303 | 3.481127e-91 |
| 4 | Life expectancy | Unemployment Rate | -0.208619 | 3.561879e-01 |
| ... | ... | ... | ... | ... |
| 139 | Cardiovascular diseases | CPI_lag2 | 0.135534 | 8.529276e-01 |
| 140 | Cardiovascular diseases | CPI_lag3 | 1.426133 | 3.418988e-01 |
| 141 | Cardiovascular diseases | BMI_avg_lag1 | -0.126308 | 8.424859e-01 |
| 142 | Cardiovascular diseases | BMI_avg_lag2 | -0.098199 | 9.188838e-01 |
| 143 | Cardiovascular diseases | BMI_avg_lag3 | -3.517348 | 4.536951e-02 |
144 rows × 4 columns
| Target | Stable Features (p < 0.05) | |
|---|---|---|
| 0 | Life expectancy | Cost of a healthy diet, Income, Inflation, Chi... |
| 1 | Diabetes | Cost of a healthy diet, Income, Inflation, GDP... |
| 2 | Cardiovascular diseases | BMI_avg_lag3 |
# Run HAC on each Target - REVISED - Scaled
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from google.colab import files
# === HAC inference function for one target
def run_hac_inference(X, y, target_name, max_lag=10):
    """Fit OLS with HAC standard errors and tabulate the coefficients.

    Args:
        X: Predictors; copied into a fresh DataFrame before use.
        y: Target as a DataFrame (its first column is used) or anything
            ``pd.Series`` accepts.
        target_name: Label written into the 'Target' column of the result.
        max_lag: ``maxlags`` passed to the HAC covariance estimator.

    Returns:
        DataFrame with columns 'Target', 'Feature', 'Coefficient' and
        'p-value (HAC)', one row per fitted term except the intercept.
    """
    predictors = pd.DataFrame(X).copy()

    # Reduce the response to one dimension
    response = y.iloc[:, 0] if isinstance(y, pd.DataFrame) else pd.Series(y).squeeze()

    # Intercept + OLS fit with a HAC covariance
    design = sm.add_constant(predictors)
    fitted = sm.OLS(response, design).fit(cov_type='HAC', cov_kwds={'maxlags': max_lag})

    summary_df = pd.DataFrame({
        'Target': target_name,
        'Feature': fitted.params.index,
        'Coefficient': fitted.params.values,
        'p-value (HAC)': fitted.pvalues.values,
    })

    # The intercept row is not a candidate feature
    not_intercept = summary_df['Feature'] != 'const'
    return summary_df[not_intercept]
# === Setup
# Outcomes to analyse; each is regressed on every non-target column of df_lagged.
target_cols = ['Life expectancy', 'Diabetes', 'Cardiovascular diseases']
hac_results = []          # one coefficient table per successfully fitted target
stable_feature_dict = {}  # target -> features with HAC p-value < 0.05

# === Run HAC on each target
if 'df_lagged' in locals():
    for target in target_cols:
        print(f"\n📊 Processing HAC inference for: {target}")

        if target not in df_lagged.columns:
            print(f"⚠️ Skipping {target} — column not found")
            continue

        # Exclude all target columns — including current one — from predictors
        X = df_lagged.drop(columns=target_cols, errors='ignore')
        y = df_lagged[target]

        # Drop rows with NaNs
        data = pd.concat([X, y], axis=1).dropna()
        if data.empty:
            print(f"⛔ No clean data for {target}")
            continue

        X_clean = data[X.columns]
        y_clean = data[[target]]  # Keep as DataFrame for safe slicing

        # === Feature Scaling: Standardize X before regression.
        # Column names and row index are restored so the result lines up with y_clean.
        scaler = StandardScaler()
        X_scaled = pd.DataFrame(
            scaler.fit_transform(X_clean),
            columns=X_clean.columns,
            index=X_clean.index,
        )

        # Run HAC inference; a failure for one target must not stop the others
        try:
            summary_df = run_hac_inference(X_scaled, y_clean, target)
        except Exception as e:
            print(f"❌ HAC failed for {target}: {e}")
            continue

        hac_results.append(summary_df)

        # Extract stable features (p < 0.05); 'None' is a placeholder so the
        # exported table still shows the target when nothing is significant.
        stable_features = summary_df[summary_df['p-value (HAC)'] < 0.05]['Feature'].tolist()
        stable_feature_dict[target] = stable_features if stable_features else ['None']

    # === Combine all results
    if hac_results:
        hac_summary_df = pd.concat(hac_results).reset_index(drop=True)
        display(hac_summary_df)

        # Export full HAC summary
        hac_summary_df.to_csv("hac_inference_summary_scaled.csv", index=False)
        files.download("hac_inference_summary_scaled.csv")

        # Export stable features per target
        stable_df = pd.DataFrame([
            {'Target': tgt, 'Stable Features (p < 0.05)': ', '.join(feats)}
            for tgt, feats in stable_feature_dict.items()
        ])
        display(stable_df)
        stable_df.to_csv("hac_stable_features_scaled.csv", index=False)
        files.download("hac_stable_features_scaled.csv")

        # Optional: Export summary again for backup.
        # Tied to this run's results rather than to the mere existence of a
        # variable named hac_summary_df: a frame left over from another cell
        # would otherwise be written out under the "scaled" filename.
        hac_summary_df.to_csv("hac_summary_df_scaled.csv", index=False)
        files.download("hac_summary_df_scaled.csv")
    else:
        print("⚠️ No HAC results to export.")
else:
    print("❗ 'df_lagged' not found. Please ensure your dataset is loaded.")
📊 Processing HAC inference for: Life expectancy 📊 Processing HAC inference for: Diabetes 📊 Processing HAC inference for: Cardiovascular diseases
| Target | Feature | Coefficient | p-value (HAC) | |
|---|---|---|---|---|
| 0 | Life expectancy | Cost of a healthy diet | -0.326902 | 3.181385e-02 |
| 1 | Life expectancy | Income | -0.650294 | 1.440014e-05 |
| 2 | Life expectancy | Inflation | 0.106669 | 1.227467e-02 |
| 3 | Life expectancy | Child mortality rate | -22.724780 | 4.466047e-71 |
| 4 | Life expectancy | Unemployment Rate | -0.113464 | 4.378930e-01 |
| ... | ... | ... | ... | ... |
| 139 | Cardiovascular diseases | CPI_lag2 | 0.602719 | 7.943913e-01 |
| 140 | Cardiovascular diseases | CPI_lag3 | 6.349755 | 1.789535e-01 |
| 141 | Cardiovascular diseases | BMI_avg_lag1 | -0.449466 | 8.551041e-01 |
| 142 | Cardiovascular diseases | BMI_avg_lag2 | -0.349445 | 8.854279e-01 |
| 143 | Cardiovascular diseases | BMI_avg_lag3 | -12.517319 | 2.922596e-02 |
144 rows × 4 columns
| Target | Stable Features (p < 0.05) | |
|---|---|---|
| 0 | Life expectancy | Cost of a healthy diet, Income, Inflation, Chi... |
| 1 | Diabetes | Cost of a healthy diet, Income, Inflation, GDP... |
| 2 | Cardiovascular diseases | Unemployment Rate, Sex ratio, Gini coefficient... |
Your HAC regression results provide a statistically grounded view of how different predictors relate to the three health outcomes — Life Expectancy, Diabetes, and Cardiovascular Diseases — after adjusting for autocorrelation and heteroscedasticity. For Life Expectancy, several variables emerged as statistically significant, including Child mortality rate (extremely strong negative relationship with a near-zero p-value), Income, Inflation, GDP, CPI, and lagged features like Unemployment Rate_lag2, CPI_lag3, Inflation_lag1, and Income_lag3. These results suggest that socioeconomic and health indicators are strongly associated with longevity and can be confidently included in the forecasting model. For Diabetes, stable predictors included Income, GDP, Inflation, CPI, and Cost of a healthy diet, with especially strong significance for BMI_avg_lag3 (p ≈ 1.01E-66), which reflects a deep connection between body mass trends and diabetes outcomes. However, Cardiovascular Diseases showed relatively weak statistical signal across most features, with high p-values and low explanatory strength — indicating poor model fit. Only a few variables, such as BMI_avg_lag3, reached borderline significance.
In summary, the HAC-adjusted regression confirms the reliability of several predictors for Life Expectancy and Diabetes, providing a stable foundation for retraining your forecast models. Cardiovascular Diseases, on the other hand, lacks robust explanatory variables and may need to be reported as exploratory, or supplemented with additional features if available. Your next step would be to extract the statistically significant features (e.g., p-value < 0.05) and use them to retrain the Random Forest or regression-based models for time-series forecasting. These filtered predictors will reduce noise, improve interpretability, and enhance predictive reliability across your rolling validation framework.
Extract Stable Predictors (p < 0.05)¶
# Extract Stable Predictor - REVISED - Table
import pandas as pd
from tabulate import tabulate
from google.colab import files
# === Settings
significance_threshold = 0.05
# Single column name for the p-value in every row. The original used a
# different key ("p-value") for targets without stable predictors, which split
# the table and CSV into two half-empty columns. The bar-chart cell reads this
# exact name, so it is kept as is.
p_value_column = "p-value < 0.05"
grouped_results = []

# === Check if HAC summary exists
if 'hac_summary_df' not in globals():
    raise ValueError("⚠️ Please run the HAC regression first to generate 'hac_summary_df'.")

# === Extract stable features for each target
for target in hac_summary_df['Target'].unique():
    df_target = hac_summary_df[hac_summary_df['Target'] == target]
    stable_df = df_target[df_target['p-value (HAC)'] < significance_threshold]

    if stable_df.empty:
        # Placeholder row so the target still appears in the table. NaN (not "")
        # keeps the column convertible to float for later numeric use.
        grouped_results.append({
            "Target": target,
            "Stable Predictor": "None",
            p_value_column: float("nan")
        })
        continue

    # Most significant predictor first; p-values are stored as 4-significant-
    # digit strings for display.
    ordered = stable_df.sort_values(by='p-value (HAC)')
    grouped_results.extend(
        {
            "Target": target,
            "Stable Predictor": feature,
            p_value_column: f"{p_value:.4g}"
        }
        for feature, p_value in zip(ordered['Feature'], ordered['p-value (HAC)'])
    )

# === Create summary DataFrame
# Explicit columns keep the schema stable even when no rows were collected.
stable_summary_df = pd.DataFrame(
    grouped_results,
    columns=["Target", "Stable Predictor", p_value_column],
)

# === Display as nicely formatted table
print("\n📌 Stable Predictors (p < 0.05) by Target:\n")
print(tabulate(stable_summary_df, headers='keys', tablefmt='fancy_grid', showindex=False))

# === Export to CSV
csv_filename = "hac_stable_predictors_summary.csv"
stable_summary_df.to_csv(csv_filename, index=False)
files.download(csv_filename)
📌 Stable Predictors (p < 0.05) by Target: ╒═════════════════════════╤═══════════════════════════╤══════════════════╕ │ Target │ Stable Predictor │ p-value < 0.05 │ ╞═════════════════════════╪═══════════════════════════╪══════════════════╡ │ Life expectancy │ Child mortality rate │ 4.466e-71 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Child mortality rate_lag3 │ 8.066e-19 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Income │ 1.44e-05 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Income_lag3 │ 0.0001574 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Child mortality rate_lag2 │ 0.0004716 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ CPI_lag3 │ 0.0009038 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ CPI │ 0.002188 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Unemployment Rate_lag2 │ 0.006369 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Sex ratio │ 0.01085 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Inflation_lag3 │ 0.01139 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Inflation │ 0.01227 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ GDP │ 0.02303 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Inflation_lag1 │ 0.03167 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Cost of a healthy diet │ 0.03181 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Life expectancy │ Sex ratio_lag2 │ 0.04604 
│ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ BMI_avg_lag3 │ 1.009e-66 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ CPI │ 0.0002375 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Income │ 0.0004172 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ GDP │ 0.0007287 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Cost of a healthy diet │ 0.001212 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Median age_lag3 │ 0.001281 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Inflation │ 0.006302 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Inflation_lag1 │ 0.006318 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Sex ratio_lag3 │ 0.008815 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Inflation_lag2 │ 0.009287 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Median age_lag2 │ 0.01103 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Inflation_lag3 │ 0.03887 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Diabetes │ Income_lag3 │ 0.04485 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Cardiovascular diseases │ Sex ratio │ 0.005065 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Cardiovascular diseases │ BMI_avg_lag3 │ 0.02923 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Cardiovascular diseases │ Gini coefficient_lag3 │ 0.03251 │ ├─────────────────────────┼───────────────────────────┼──────────────────┤ │ Cardiovascular diseases │ Unemployment Rate │ 
0.03382 │ ╘═════════════════════════╧═══════════════════════════╧══════════════════╛
# Stable Predictors Bar Charts
import matplotlib.pyplot as plt
import seaborn as sns
# Convert the formatted p-value strings to numbers for plotting. Entries that
# are not numeric (e.g. a blank placeholder for a target without stable
# predictors) become NaN instead of raising, as astype(float) would.
stable_summary_df['p-value'] = pd.to_numeric(
    stable_summary_df['p-value < 0.05'], errors='coerce'
)

# Set plot style
sns.set(style="whitegrid")

# Get list of unique targets
targets = stable_summary_df['Target'].unique()

# Create one bar chart per target
for target in targets:
    target_df = stable_summary_df[stable_summary_df['Target'] == target].copy()

    # Drop rows without a usable p-value, then sort (lowest = most significant)
    target_df = target_df.dropna(subset=['p-value']).sort_values('p-value', ascending=True)
    if target_df.empty:
        # Nothing to draw; avoid opening an empty figure
        print(f"⚠️ No stable predictors to plot for {target}")
        continue

    # -log10(p) so that a longer bar means a more significant predictor
    target_df['neg_log10_p'] = -np.log10(target_df['p-value'])

    plt.figure(figsize=(10, 6))
    # Passing `palette` without `hue` is deprecated in seaborn (see the
    # FutureWarning this cell used to emit); assigning the y variable to `hue`
    # with legend=False gives the same colouring.
    sns.barplot(
        data=target_df,
        x='neg_log10_p',
        y='Stable Predictor',
        hue='Stable Predictor',
        palette='viridis',
        legend=False
    )
    plt.title(f"Stable Predictors for {target} (p < 0.05)", fontsize=14)
    plt.xlabel('-log10(p-value)')
    plt.ylabel('Predictor')
    plt.tight_layout()
    plt.show()
/tmp/ipython-input-21-1853306393.py:24: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
/tmp/ipython-input-21-1853306393.py:24: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
/tmp/ipython-input-21-1853306393.py:24: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
Retrain Models per Targets and Forecast Evaluation¶
# Retrain Models per Targets and Forecast Evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
from tabulate import tabulate
# === Rebuild stable_features_dict (target -> significant predictors) from hac_summary_df ===
# hac_summary_df is expected to exist from the HAC regression cell
stable_features_dict = {}
if 'hac_summary_df' not in locals() or hac_summary_df.empty:
    # No hard stop here: the empty dict is handled further down
    print("Error: hac_summary_df not found or is empty. Cannot proceed with retraining.")
else:
    significance_threshold = 0.05  # p-value cut-off
    is_significant = hac_summary_df['p-value (HAC)'] < significance_threshold
    for target in hac_summary_df['Target'].unique():
        target_rows = is_significant & (hac_summary_df['Target'] == target)
        stable_features_dict[target] = hac_summary_df.loc[target_rows, 'Feature'].tolist()

# df_lagged is expected to exist from earlier cells
forecast_results = []

# Chronological order when a 'Year' column is available, otherwise a plain copy
if 'Year' in df_lagged.columns:
    df_sorted = df_lagged.sort_values(by='Year')
else:
    df_sorted = df_lagged.copy()

if not stable_features_dict:
    print("stable_features_dict was not populated. Please check previous steps.")
else:
    for target, features in stable_features_dict.items():
        # Nothing significant for this target
        if not features:
            print(f"⚠️ Skipping '{target}' — no stable predictors found.")
            continue
        print(f"\n✅ Training model for: {target}")

        # Restrict to predictors that are actual columns of df_sorted
        existing_features = [f for f in features if f in df_sorted.columns]
        if not existing_features:
            print(f"⚠️ Skipping '{target}' — none of the selected stable features exist in the DataFrame.")
            continue

        # Complete rows only
        df_subset = df_sorted[existing_features + [target]].dropna()
        if df_subset.empty:
            print(f"⛔ Skipping '{target}' — no data available after dropping NaNs for selected features.")
            continue
        X = df_subset[existing_features]
        y = df_subset[target]

        # First 80% of the ordered rows train the model, the rest test it
        split_idx = int(len(df_subset) * 0.8)
        X_train = X.iloc[:split_idx]
        X_test = X.iloc[split_idx:]
        y_train = y.iloc[:split_idx]
        y_test = y.iloc[split_idx:]
        if X_train.empty or X_test.empty:
            print(f"⛔ Skipping '{target}' — Train or Test set is empty after splitting.")
            continue

        # Fit the forest (fixed seed)
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Score on the held-out tail
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        print(f"📉 RMSE: {rmse:.3f}")
        print(f"📊 R² Score: {r2:.3f}")

        # One summary row per target
        forecast_results.append({
            'Target': target,
            'RMSE': round(rmse, 3),
            'R²': round(r2, 3),
            'Stable Features Count': len(existing_features),
            'Stable Features': ', '.join(existing_features)
        })

    # Summary table and CSV export
    if not forecast_results:
        print("\nNo forecast results were generated.")
    else:
        results_df = pd.DataFrame(forecast_results)
        print("\n🔍 Forecast Model Performance Summary (using stable features):")
        print(tabulate(results_df, headers="keys", tablefmt="fancy_grid", showindex=False))

        filename = "forecast_model_performance_summary_stable_features.csv"
        results_df.to_csv(filename, index=False)
        print(f"\n⬇️ Downloading {filename}")
        from google.colab import files
        files.download(filename)
✅ Training model for: Life expectancy 📉 RMSE: 3.268 📊 R² Score: 0.917 ✅ Training model for: Diabetes 📉 RMSE: 3.289 📊 R² Score: 0.349 ✅ Training model for: Cardiovascular diseases 📉 RMSE: 263.302 📊 R² Score: 0.003 🔍 Forecast Model Performance Summary (using stable features): ╒═════════════════════════╤═════════╤═══════╤═════════════════════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕ │ Target │ RMSE │ R² │ Stable Features Count │ Stable Features │ ╞═════════════════════════╪═════════╪═══════╪═════════════════════════╪═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡ │ Life expectancy │ 3.268 │ 0.917 │ 15 │ Cost of a healthy diet, Income, Inflation, Child mortality rate, Sex ratio, GDP, CPI, Income_lag3, Inflation_lag1, Inflation_lag3, Child mortality rate_lag2, Child mortality rate_lag3, Unemployment Rate_lag2, Sex ratio_lag2, CPI_lag3 │ ├─────────────────────────┼─────────┼───────┼─────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ Diabetes │ 3.289 │ 0.349 │ 13 │ Cost of a healthy diet, Income, Inflation, GDP, CPI, Income_lag3, Inflation_lag1, Inflation_lag2, Inflation_lag3, Sex ratio_lag3, Median age_lag2, Median age_lag3, BMI_avg_lag3 │ 
├─────────────────────────┼─────────┼───────┼─────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ Cardiovascular diseases │ 263.302 │ 0.003 │ 4 │ Unemployment Rate, Sex ratio, Gini coefficient_lag3, BMI_avg_lag3 │ ╘═════════════════════════╧═════════╧═══════╧═════════════════════════╧═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╛ ⬇️ Downloading forecast_model_performance_summary_stable_features.csv
The model results show promising insights for two out of three health targets. For Life Expectancy, your retrained model performs exceptionally well — achieving an RMSE of 3.27 and an R² of 0.917, which means roughly 92% of the variation is successfully explained using the refined HAC-stable predictors. These include socioeconomic indicators such as Income, GDP, CPI, and crucial health-related factors like Child mortality rate and several significant lag features. For Diabetes, the model performs moderately well, with an RMSE of 3.29 and an R² of 0.349. While the predictive power is limited, it still identifies useful relationships between the target and predictors like BMI trends, Income, and Inflation. On the other hand, the model for Cardiovascular Diseases performs poorly, with a high RMSE of 263.30 and an R² of only 0.003, meaning the model performs essentially no better than simply predicting the average. This outcome strongly suggests that the available features do not explain cardiovascular outcomes effectively, and the model should be flagged as exploratory or omitted entirely from forecasting.
All three target variables—life expectancy, cardiovascular diseases, and diabetes—demonstrate clear signs of stationarity, as revealed by the Augmented Dickey-Fuller (ADF) test results. For life expectancy, the ADF statistic of –18.54 and an extremely small p-value indicate that the series is strongly stationary, with stable mean and variance over time. Similarly, cardiovascular diseases show an ADF statistic of -12.63 and a highly significant p-value, confirming stationarity in that time series as well. Diabetes follows the same trend, with an ADF statistic of –12.32 and a p-value well below conventional thresholds, again rejecting the null hypothesis of non-stationarity. In each case, the test statistic is more negative than the critical values at 1%, 5%, and 10% significance levels. This means you can confidently model these variables using standard regression techniques without having to transform them to achieve stationarity. It also reinforces that observed patterns are relatively stable across time, which supports both interpretation and forecasting with traditional linear models.
Time Series Forecasting with Walk-Forward Validation using ARIMA, Prophet, and Random Forests (RMSE Evaluation)¶
Rolling Forecast Validation (Walk-Forward)¶
Rolling or walk-forward forecast validation is a technique used to evaluate the performance of time series forecasting models in a way that closely resembles real-world forecasting scenarios. Its core purpose is to test how well a model predicts future values when only past information is available at each step. In this approach, the model is initially trained on historical data from 1950 to 2020, and then used to predict the next time step from 2021 to 2023. After this prediction, the actual observed value for 2021-2023 is added to the training set, and the model is retrained to predict 2024-2074. This process is repeated step-by-step, moving forward through time.
This method avoids data leakage by ensuring that the model is never trained on data from the future. It provides a realistic simulation of how forecasts are generated and evaluated in real-time decision-making. Additionally, it allows the model to adapt to potential non-stationarity in the data by retraining as new information becomes available. Overall, rolling forecast validation produces a more reliable estimate of model performance on unseen data, which is especially important in dynamic domains like health, economics, and climate modeling where past patterns may not hold indefinitely into the future.
Ten countries spanning different income levels have been selected for rolling forecast validation (walk-forward), as follows:
- United States - High-income
- Germany - High-income
- Japan - High-income
- Brazil - Upper-middle-income
- India - Lower-middle-income
- Indonesia - Lower-middle-income
- Nigeria - Low-income
- Kenya - Low-income
- Mexico - Upper-middle-income
- Bangladesh - Lower-middle-income
# Rolling Forecast - Walk Forward Validation
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# === Setup ===
selected_countries = [
    'United States', 'Germany', 'Japan', 'Brazil', 'India',
    'Indonesia', 'Nigeria', 'Kenya', 'Mexico', 'Bangladesh'
]
target_columns = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
selected_features_dict = {
    'Life expectancy': [
        'Child mortality rate', 'GDP', 'CPI_lag3', 'Incomplete tertiary education_lag3', 'Income_lag3', 'Income',
        'CPI', 'Inflation', 'Inflation_lag1', 'Cost of a healthy diet', 'Cost of a healthy diet_lag3', 'Unemployment Rate_lag2',
        'Gini coefficient_lag3', 'Unemployment Rate_lag1'
    ],
    'Cardiovascular diseases': [
        'BMI_avg_lag3'
    ],
    'Diabetes': [
        'BMI_avg_lag3', 'CPI', 'GDP', 'Income', 'Income_lag1', 'Inflation_lag1', 'Inflation', 'Cost of a healthy diet', 'Inflation_lag2',
        'Inflation_lag3'
    ]
}
start_train = 1950
end_train = 2020
real_eval_period = [2021, 2022, 2023]
# Forecast horizon: 2024 through 2073 inclusive (range() excludes 2074)
forecast_horizon = list(range(2024, 2074))

# === Create future rows for years 2024 to 2073 (one per country and year)
future_rows = [
    {'Country': country, 'Year': year}
    for country in df_combined_with_country['Country'].unique()
    for year in forecast_horizon
]
df_future = pd.DataFrame(future_rows)
df_forecast_ready = pd.concat([df_combined_with_country, df_future], ignore_index=True)
df_forecast_ready['Year'] = df_forecast_ready['Year'].astype(int)

# === Impute missing values across all countries and years
# NOTE: ffill/bfill runs over every column of each country's frame, targets
# included. Future rows therefore inherit the last observed values, and any
# missing 2021-2023 target is filled from its neighbours before evaluation.
df_forecast_ready = (
    df_forecast_ready
    .sort_values(['Country', 'Year'])
    .groupby('Country', group_keys=False)
    .apply(lambda x: x.ffill().bfill().infer_objects(copy=False))
    .reset_index(drop=True)
)

# === Initialize summary table
predictions_summary = []

# === Forecast Loop ===
# For each country/target: fit on start_train..end_train, score on
# real_eval_period, then forecast over forecast_horizon.
for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')
    for target in target_columns:
        print(f"\n {country} — {target}")
        if target not in df_country.columns:
            print(" Target missing")
            continue
        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            print(" No usable features found")
            continue
        df_train = df_country[df_country['Year'].between(start_train, end_train)]
        df_eval_real = df_country[df_country['Year'].isin(real_eval_period)]

        # === ARIMA ===
        arima_rmse = None
        try:
            df_train_arima = df_train[[target]].copy()
            # Synthetic yearly index starting at start_train
            # (assumes the training rows are consecutive years — TODO confirm)
            df_train_arima.index = pd.date_range(start=f'{start_train}', periods=len(df_train_arima), freq='YE')
            model_arima = ARIMA(df_train_arima, order=(1, 1, 1)).fit()
            n_train = len(df_train_arima)
            # Real evaluation: the steps immediately after the training window
            pred_real = model_arima.predict(start=n_train, end=n_train + len(df_eval_real) - 1)
            actual_real = df_eval_real[target].values
            arima_rmse = np.sqrt(mean_squared_error(actual_real, pred_real))
            # Forecast for 2024-2073: skip the evaluation years first. Starting
            # at n_train (as before) returned 2021-2070 instead.
            horizon_start = n_train + len(real_eval_period)
            arima_forecast = model_arima.predict(
                start=horizon_start,
                end=horizon_start + len(forecast_horizon) - 1,
            )
            print(f"📉 ARIMA RMSE: {arima_rmse:.2f}")
        except Exception as e:
            print(f" ARIMA error: {e}")

        # === Prophet ===
        prophet_rmse = None
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model_prophet = Prophet()
            model_prophet.fit(prophet_df)
            future_years = real_eval_period + forecast_horizon
            future_dates = pd.DataFrame({'ds': pd.to_datetime(future_years, format='%Y')})
            forecast_prophet = model_prophet.predict(future_dates)
            # Real evaluation
            pred_real = forecast_prophet[forecast_prophet['ds'].dt.year.isin(real_eval_period)]['yhat'].values
            actual_real = df_eval_real[target].values
            prophet_rmse = np.sqrt(mean_squared_error(actual_real, pred_real))
            # Forecast for 2024-2073
            prophet_forecast = forecast_prophet[forecast_prophet['ds'].dt.year.isin(forecast_horizon)]
            print(f" Prophet RMSE: {prophet_rmse:.2f}")
        except Exception as e:
            print(f" Prophet error: {e}")

        #### Random Forest ####
        # Placeholder has one slot per horizon year, matching a successful forecast
        rf_rmse, rf_forecast = None, [None] * len(forecast_horizon)
        try:
            years = df_country['Year']
            X = df_country[available_features]
            y = df_country[target]
            train_mask = years.between(start_train, end_train)
            eval_mask = years.isin(real_eval_period)
            X_train, y_train = X[train_mask], y[train_mask]
            X_eval, y_eval = X[eval_mask], y[eval_mask]
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            pred_eval = model.predict(X_eval)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, pred_eval))
            print(f" Random Forest RMSE: {rf_rmse:.2f}")
            # Forecast only when every horizon row has complete features
            X_forecast = X[years.isin(forecast_horizon)]
            if not X_forecast.isnull().any(axis=1).any():
                rf_forecast = model.predict(X_forecast).tolist()
        except Exception as e:
            # Report the failure like the ARIMA/Prophet sections do instead of
            # silently swallowing it; the summary row keeps RF_RMSE = None.
            print(f" Random Forest error: {e}")

        # === Append to summary ===
        predictions_summary.append({
            "Country": country,
            "Target": target,
            "ARIMA_RMSE": round(arima_rmse, 4) if arima_rmse is not None else None,
            "Prophet_RMSE": round(prophet_rmse, 4) if prophet_rmse is not None else None,
            "RF_RMSE": round(rf_rmse, 4) if rf_rmse is not None else None
        })

# === Final Summary Table ===
# Explicit columns fix the order and keep this valid when no rows were collected
df_forecast_validation_summary = pd.DataFrame(
    predictions_summary,
    columns=["Country", "Target", "ARIMA_RMSE", "Prophet_RMSE", "RF_RMSE"],
)
print("\n 📋 Rolling Forecast Validation Summary:")
print(df_forecast_validation_summary)

# Export summary
df_forecast_validation_summary.to_csv("forecast_summary.csv", index=False)

# Download to your computer
from google.colab import files
files.download("forecast_summary.csv")
/tmp/ipython-input-37-3481924030.py:57: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
.apply(lambda x: x.ffill().bfill().infer_objects(copy=False))
/tmp/ipython-input-37-3481924030.py:57: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda x: x.ffill().bfill().infer_objects(copy=False))
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
United States — Life expectancy 📉 ARIMA RMSE: 2.00
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hv2ktus1.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/oe_r3aoy.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47361', 'data', 'file=/tmp/tmprjkocm4m/hv2ktus1.json', 'init=/tmp/tmprjkocm4m/oe_r3aoy.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3c9z48hr/prophet_model-20250723141100.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:00 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:11:02 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 1.56 United States — Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sa5_nkw_.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pn5w6eso.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25332', 'data', 'file=/tmp/tmprjkocm4m/sa5_nkw_.json', 'init=/tmp/tmprjkocm4m/pn5w6eso.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelyycjtjmt/prophet_model-20250723141104.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:04 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.19
14:11:04 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 11.97 United States — Diabetes
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sghhl48c.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ahdz28xn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97841', 'data', 'file=/tmp/tmprjkocm4m/sghhl48c.json', 'init=/tmp/tmprjkocm4m/ahdz28xn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modely7ekj70r/prophet_model-20250723141105.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:05 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.01
14:11:05 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.49 Germany — Life expectancy
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k2qniys2.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/atdkbrr0.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=21012', 'data', 'file=/tmp/tmprjkocm4m/k2qniys2.json', 'init=/tmp/tmprjkocm4m/atdkbrr0.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnw233o6r/prophet_model-20250723141106.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.47
14:11:07 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.61 Germany — Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ijt_1k_k.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xt8pr25r.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=48144', 'data', 'file=/tmp/tmprjkocm4m/ijt_1k_k.json', 'init=/tmp/tmprjkocm4m/xt8pr25r.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelog7no6i2/prophet_model-20250723141108.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:08 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.43
14:11:09 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 2.13 Germany — Diabetes
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j6behbrs.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/32yl0owc.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97012', 'data', 'file=/tmp/tmprjkocm4m/j6behbrs.json', 'init=/tmp/tmprjkocm4m/32yl0owc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelq_r770jn/prophet_model-20250723141109.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:09 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.00
14:11:09 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 2.76 Japan — Life expectancy
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/df3mky2k.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/i0y34wue.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77900', 'data', 'file=/tmp/tmprjkocm4m/df3mky2k.json', 'init=/tmp/tmprjkocm4m/i0y34wue.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzl5v553j/prophet_model-20250723141110.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:10 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.64
14:11:10 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.58 Japan — Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/v4jzd52u.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/eys9n03c.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=52127', 'data', 'file=/tmp/tmprjkocm4m/v4jzd52u.json', 'init=/tmp/tmprjkocm4m/eys9n03c.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modela5jsybat/prophet_model-20250723141111.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.55
14:11:11 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Prophet RMSE: 7.69 Japan — Diabetes 📉 ARIMA RMSE: 0.00
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sqo53_sk.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/nw3ja3jo.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=30846', 'data', 'file=/tmp/tmprjkocm4m/sqo53_sk.json', 'init=/tmp/tmprjkocm4m/nw3ja3jo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelao3gb3zi/prophet_model-20250723141111.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:11 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:11:12 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 1.84 Brazil — Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_mkeoghc.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/5d68fu18.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47694', 'data', 'file=/tmp/tmprjkocm4m/_mkeoghc.json', 'init=/tmp/tmprjkocm4m/5d68fu18.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelemn_5pxf/prophet_model-20250723141112.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:12 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 3.01
14:11:12 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 2.19 Brazil — Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k4tsl6ub.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/brdxiy8q.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=12825', 'data', 'file=/tmp/tmprjkocm4m/k4tsl6ub.json', 'init=/tmp/tmprjkocm4m/brdxiy8q.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelb1p2eqd8/prophet_model-20250723141113.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:13 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.82
14:11:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 6.55 Brazil — Diabetes 📉 ARIMA RMSE: 0.00
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2tv9vzvy.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/34ls6e25.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=19841', 'data', 'file=/tmp/tmprjkocm4m/2tv9vzvy.json', 'init=/tmp/tmprjkocm4m/34ls6e25.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelic3ef0zy/prophet_model-20250723141113.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:13 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:14 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.19 India — Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/w3ahvdnj.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k_004rg9.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7967', 'data', 'file=/tmp/tmprjkocm4m/w3ahvdnj.json', 'init=/tmp/tmprjkocm4m/k_004rg9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpwl_gp6r/prophet_model-20250723141116.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:16 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.97
14:11:16 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 2.48 India — Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bdnme65z.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cn7m_qtq.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=72093', 'data', 'file=/tmp/tmprjkocm4m/bdnme65z.json', 'init=/tmp/tmprjkocm4m/cn7m_qtq.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model7dhx381j/prophet_model-20250723141117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 19.66
14:11:17 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 37.42 India — Diabetes
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ji15abxw.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qaexe7ak.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=42659', 'data', 'file=/tmp/tmprjkocm4m/ji15abxw.json', 'init=/tmp/tmprjkocm4m/qaexe7ak.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model7_9nqat3/prophet_model-20250723141117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.02
14:11:18 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.83 Indonesia — Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/or6ybfeu.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gjbudn2r.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=24173', 'data', 'file=/tmp/tmprjkocm4m/or6ybfeu.json', 'init=/tmp/tmprjkocm4m/gjbudn2r.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelb2l_7ydy/prophet_model-20250723141118.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:18 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.89
14:11:18 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 1.69 Indonesia — Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pkwwmjuc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/udz6n54u.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=61659', 'data', 'file=/tmp/tmprjkocm4m/pkwwmjuc.json', 'init=/tmp/tmprjkocm4m/udz6n54u.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model4k2hju0b/prophet_model-20250723141119.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:19 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 8.49
14:11:19 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Prophet RMSE: 8.00 Indonesia — Diabetes 📉 ARIMA RMSE: 0.00
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rjaf6qb0.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sa1qdzyl.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=43599', 'data', 'file=/tmp/tmprjkocm4m/rjaf6qb0.json', 'init=/tmp/tmprjkocm4m/sa1qdzyl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model33g9hdrq/prophet_model-20250723141120.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:20 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:11:20 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.71 Nigeria — Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j5uixll_.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c2ncchjl.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=63492', 'data', 'file=/tmp/tmprjkocm4m/j5uixll_.json', 'init=/tmp/tmprjkocm4m/c2ncchjl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelbo2jk3p2/prophet_model-20250723141120.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:20 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.70
14:11:21 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.37 Nigeria — Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6signe76.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bbuk9v6d.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=50847', 'data', 'file=/tmp/tmprjkocm4m/6signe76.json', 'init=/tmp/tmprjkocm4m/bbuk9v6d.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2yahfc88/prophet_model-20250723141121.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:21 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.72
14:11:21 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 4.50 Nigeria — Diabetes 📉 ARIMA RMSE: 0.00
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sgchcqf2.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0dqqsdjd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22456', 'data', 'file=/tmp/tmprjkocm4m/sgchcqf2.json', 'init=/tmp/tmprjkocm4m/0dqqsdjd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model61v4p2b5/prophet_model-20250723141122.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:22 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:22 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.14 Kenya — Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c8_qu5z9.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o2y5cnz7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=70118', 'data', 'file=/tmp/tmprjkocm4m/c8_qu5z9.json', 'init=/tmp/tmprjkocm4m/o2y5cnz7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelc374tma2/prophet_model-20250723141122.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:22 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 3.24
14:11:23 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 1.67 Kenya — Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/m5eq4lm3.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cwqwo01b.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=3353', 'data', 'file=/tmp/tmprjkocm4m/m5eq4lm3.json', 'init=/tmp/tmprjkocm4m/cwqwo01b.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modely2k5w4ma/prophet_model-20250723141123.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:23 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.12
14:11:23 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.93 Kenya — Diabetes 📉 ARIMA RMSE: 0.00
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xc74yh1y.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rsc5b19s.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=14523', 'data', 'file=/tmp/tmprjkocm4m/xc74yh1y.json', 'init=/tmp/tmprjkocm4m/rsc5b19s.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhpvvgbar/prophet_model-20250723141124.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:24 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:11:24 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 3.48 Mexico — Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/otq4fust.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gvom3sq3.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=26553', 'data', 'file=/tmp/tmprjkocm4m/otq4fust.json', 'init=/tmp/tmprjkocm4m/gvom3sq3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeli2dck5t_/prophet_model-20250723141125.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:25 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 6.22
14:11:25 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 2.43 Mexico — Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yngac99c.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wkrpqp3_.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17534', 'data', 'file=/tmp/tmprjkocm4m/yngac99c.json', 'init=/tmp/tmprjkocm4m/wkrpqp3_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelqk6ke56c/prophet_model-20250723141126.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:26 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.58
14:11:26 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.84 Mexico — Diabetes 📉 ARIMA RMSE: 0.00
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t5yrudgj.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/om9xdt73.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7342', 'data', 'file=/tmp/tmprjkocm4m/t5yrudgj.json', 'init=/tmp/tmprjkocm4m/om9xdt73.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelggoo_1se/prophet_model-20250723141126.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:26 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:11:26 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 0.80
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o7aet220.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zau5k5sq.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=92840', 'data', 'file=/tmp/tmprjkocm4m/o7aet220.json', 'init=/tmp/tmprjkocm4m/zau5k5sq.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeld16cfamt/prophet_model-20250723141127.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
Bangladesh — Life expectancy 📉 ARIMA RMSE: 2.31
14:11:27 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:11:27 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 1.68 Bangladesh — Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zocgl0ld.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jjkdu3gf.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=30638', 'data', 'file=/tmp/tmprjkocm4m/zocgl0ld.json', 'init=/tmp/tmprjkocm4m/jjkdu3gf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelar84sbsb/prophet_model-20250723141128.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:28 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.18
14:11:28 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 6.99
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jop9n5f7.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rniq3m60.json DEBUG:cmdstanpy:idx 0
Bangladesh — Diabetes 📉 ARIMA RMSE: 0.00
DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17359', 'data', 'file=/tmp/tmprjkocm4m/jop9n5f7.json', 'init=/tmp/tmprjkocm4m/rniq3m60.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5zbgtsvv/prophet_model-20250723141129.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:11:29 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:11:30 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
Prophet RMSE: 2.99
📋 Rolling Forecast Validation Summary:
Country Target ARIMA_RMSE Prophet_RMSE RF_RMSE
0 United States Life expectancy 1.9969 1.5614 1.2177
1 United States Cardiovascular diseases 1.1904 11.9749 10.0919
2 United States Diabetes 0.0080 0.4896 0.0040
3 Germany Life expectancy 0.4746 0.6124 0.3367
4 Germany Cardiovascular diseases 0.4339 2.1255 0.9503
5 Germany Diabetes 0.0000 2.7582 0.0000
6 Japan Life expectancy 0.6387 0.5765 0.3200
7 Japan Cardiovascular diseases 1.5477 7.6884 4.2376
8 Japan Diabetes 0.0000 1.8411 0.0162
9 Brazil Life expectancy 3.0096 2.1896 1.2862
10 Brazil Cardiovascular diseases 1.8195 6.5472 3.5130
11 Brazil Diabetes 0.0000 0.1860 0.0457
12 India Life expectancy 1.9737 2.4758 2.1906
13 India Cardiovascular diseases 19.6630 37.4210 47.5512
14 India Diabetes 0.0197 0.8306 0.0017
15 Indonesia Life expectancy 1.8872 1.6929 1.6442
16 Indonesia Cardiovascular diseases 8.4866 7.9981 0.0971
17 Indonesia Diabetes 0.0000 0.7121 0.0035
18 Nigeria Life expectancy 0.7003 0.3693 1.2444
19 Nigeria Cardiovascular diseases 0.7164 4.4984 3.6177
20 Nigeria Diabetes 0.0000 0.1408 0.0027
21 Kenya Life expectancy 3.2353 1.6706 1.2934
22 Kenya Cardiovascular diseases 0.1218 0.9335 0.7993
23 Kenya Diabetes 0.0004 3.4797 0.0052
24 Mexico Life expectancy 6.2245 2.4286 2.4902
25 Mexico Cardiovascular diseases 0.5788 0.8437 6.2764
26 Mexico Diabetes 0.0000 0.7997 0.4129
27 Bangladesh Life expectancy 2.3127 1.6767 2.2987
28 Bangladesh Cardiovascular diseases 1.1756 6.9912 4.9245
29 Bangladesh Diabetes 0.0000 2.9878 0.1017
Result of Rolling Forecast Validation Summary Table:
Life Expectancy: Random Forest consistently performs best (lowest RMSE) in countries like the US (1.22), Germany (0.34), Japan (0.32), and Kenya (1.29).
Prophet also shows strong performance, especially in Nigeria (0.37), Japan (0.57), and Brazil (2.18), outperforming ARIMA in many cases.
ARIMA lags behind in several regions — e.g., Mexico (6.22), Kenya (3.23), and Brazil (3.00) — likely due to its assumption of linearity and stationarity.
Insight: Life expectancy benefits from tree-based models and components that capture nonlinearity, such as RF and Prophet.
Cardiovascular Diseases: ARIMA generally performs well, especially in countries like Kenya (0.12), Mexico (0.58), Nigeria (0.72), and Germany (0.43).
Prophet struggles considerably in places like India (37.42), Indonesia (8.00), and Japan (7.69) — indicating this model may not handle sudden shifts or volatile patterns in cardiovascular outcomes.
RF offers competitive results in some countries, particularly Indonesia (0.10) and Germany (0.95).
Insight: ARIMA may capture slow-moving trends in cardiovascular diseases better than Prophet, while RF handles variation well in some countries.
Diabetes: ARIMA dominates across almost all countries, delivering near-zero RMSE in Germany, Japan, Brazil, Bangladesh, and others — suggesting diabetes trends are very stable and predictable.
RF also performs well, though usually with slightly higher RMSE.
Prophet tends to underperform, with RMSE peaking in Bangladesh (2.99), Germany (2.75), and Kenya (3.48).
Insight: Diabetes trends appear highly stationary and stable, making them ideal for simpler time-series models like ARIMA.
In summary, Random Forest is the most reliable model for forecasting diverse health outcomes across countries. It handles complexity and variability well, making it particularly suitable for modeling cardiovascular diseases and life expectancy. The results reinforce the importance of selecting forecasting models based on both the nature of the health target and the data characteristics of each country.
Final Model Training & Forecasting | Evaluation metrics (RMSE, MAPE, R²)
Once the models' performance has been validated with walk-forward validation and the best-performing model(s) have been selected, the next step is to train the final model on the available historical data (1950–2023), so that as much information as possible is available for learning patterns. In the code below, the models are fitted on 1950–2020 and the years 2021–2023 are held out for evaluation. The trained model is then used to generate forecasts for the future, from 2024 to 2074.
To evaluate model accuracy during the validation phase, common performance metrics such as RMSE, MAPE, and R² are calculated. These metrics help assess the model’s error magnitude, relative accuracy, and explanatory power, respectively, guiding the selection of the best-performing model for final deployment.
# Step 19 Final Model Training & Forecasting - ok
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import warnings
import logging
warnings.filterwarnings("ignore")
logging.getLogger('statsmodels').setLevel(logging.ERROR)
# Model failures are reported through this logger instead of being silently
# discarded, so a missing RMSE/forecast in the output can be traced to a cause.
forecast_logger = logging.getLogger("final_forecast")

# === Time Ranges
start_train = 1950
end_train = 2020                            # last year used for fitting
eval_years = [2021, 2022, 2023]             # held-out years scored with RMSE
forecast_horizon = list(range(2024, 2075))  # 2024..2074 inclusive

# === Input Variables
selected_countries = [
    'United States', 'Germany', 'Japan', 'Brazil', 'India',
    'Indonesia', 'Nigeria', 'Kenya', 'Mexico', 'Bangladesh'
]
target_columns = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
# Predictor columns used by the Random Forest for each target.
selected_features_dict = {
    'Life expectancy': [
        'Child mortality rate', 'GDP', 'CPI_lag3', 'Incomplete tertiary education_lag3', 'Income_lag3', 'Income',
        'CPI', 'Inflation', 'Inflation_lag1', 'Cost of a healthy diet', 'Cost of a healthy diet_lag3', 'Unemployment Rate_lag2',
        'Gini coefficient_lag3', 'Unemployment Rate_lag1'
    ],
    'Cardiovascular diseases': [
        'BMI_avg_lag3'
    ],
    'Diabetes': [
        'BMI_avg_lag3', 'CPI', 'GDP', 'Income', 'Income_lag1', 'Inflation_lag1', 'Inflation', 'Cost of a healthy diet', 'Inflation_lag2',
        'Inflation_lag3'
    ]
}

# === Ready Dataset (already loaded)
# df_forecast_ready = your real dataset

# === Forecasting and Evaluation
# One output row per (country, target, forecast year). Each row carries the
# three models' hold-out RMSE and their forecast for that year (None when a
# model failed or could not be applied).
forecast_summary = []
for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')

    # The year masks do not depend on the target, so build them once per country.
    train_mask = df_country['Year'].between(start_train, end_train)
    eval_mask = df_country['Year'].isin(eval_years)
    forecast_mask = df_country['Year'].isin(forecast_horizon)
    df_train = df_country[train_mask]
    df_eval = df_country[eval_mask]
    df_forecast = df_country[forecast_mask]
    n_forecast = len(df_forecast)

    for target in target_columns:
        if target not in df_country.columns:
            continue
        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            continue

        actual_eval = df_eval[target].values

        #### ARIMA ####
        arima_rmse, arima_forecast = None, [None] * n_forecast
        try:
            train_series = df_train[[target]].copy()
            # A regular yearly index gives statsmodels a frequency; predictions
            # below are requested by integer position past the end of the sample.
            train_series.index = pd.date_range(start=str(start_train), periods=len(train_series), freq='YE')
            arima_model = ARIMA(train_series, order=(1, 1, 1)).fit()
            n_train = len(train_series)
            n_eval = len(df_eval)
            pred_eval = arima_model.predict(start=n_train, end=n_train + n_eval - 1)
            arima_rmse = np.sqrt(mean_squared_error(actual_eval, pred_eval))
            # Forecast steps follow directly after the evaluation steps.
            arima_forecast = arima_model.predict(
                start=n_train + n_eval,
                end=n_train + n_eval + n_forecast - 1,
            ).tolist()
        except Exception as exc:
            # Best effort: keep whatever was computed so far, but say why.
            forecast_logger.warning("ARIMA failed for %s — %s: %s", country, target, exc)

        #### Prophet ####
        prophet_rmse, prophet_forecast = None, [None] * n_forecast
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            prophet_model = Prophet()
            prophet_model.fit(prophet_df)
            # Score on the evaluation years actually present for this country so
            # predictions and actual values always have the same length.
            eval_dates = pd.DataFrame({'ds': pd.to_datetime(df_eval['Year'].tolist(), format='%Y')})
            forecast_eval = prophet_model.predict(eval_dates)
            prophet_rmse = np.sqrt(mean_squared_error(actual_eval, forecast_eval['yhat'].values))
            forecast_years = pd.DataFrame({'ds': pd.to_datetime(df_forecast['Year'], format='%Y')})
            prophet_forecast = prophet_model.predict(forecast_years)['yhat'].tolist()
        except Exception as exc:
            forecast_logger.warning("Prophet failed for %s — %s: %s", country, target, exc)

        #### Random Forest ####
        rf_rmse, rf_forecast = None, [None] * n_forecast
        try:
            X_train = df_train[available_features]
            y_train = df_train[target]
            X_eval = df_eval[available_features]
            y_eval = df_eval[target]
            rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            rf_model.fit(X_train, y_train)
            pred_eval = rf_model.predict(X_eval)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, pred_eval))
            X_forecast = df_forecast[available_features]
            # Only forecast when there are future rows and every one of them
            # has a complete set of feature values.
            if not X_forecast.empty and not X_forecast.isnull().any(axis=1).any():
                rf_forecast = rf_model.predict(X_forecast).tolist()
        except Exception as exc:
            forecast_logger.warning("Random Forest failed for %s — %s: %s", country, target, exc)

        for i, year in enumerate(df_forecast['Year']):
            forecast_summary.append({
                "Country": country,
                "Target": target,
                "Year": year,
                "ARIMA_RMSE": arima_rmse,
                "ARIMA_Forecast": arima_forecast[i],
                "Prophet_RMSE": prophet_rmse,
                "Prophet_Forecast": prophet_forecast[i],
                "RF_RMSE": rf_rmse,
                "RF_Forecast": rf_forecast[i]
            })

# === Combine All Results
df_model_comparison = pd.DataFrame(forecast_summary)

# === Summary Table: Best Model by RMSE
# The RMSE is repeated on every forecast-year row, so the first value per
# (Country, Target) group is enough.
summary_table = df_model_comparison.groupby(['Country', 'Target'])[['ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE']].first().reset_index()
def best_model_picker(row):
    """Return the name of the model with the lowest RMSE in *row*.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed with 'ARIMA_RMSE', 'Prophet_RMSE' and 'RF_RMSE'.

    Returns:
        'ARIMA', 'Prophet' or 'RF'. Missing scores (None/NaN) are ignored and
        ties go to the first model in that order. Returns None when all
        three scores are missing, so that no model is reported as "best"
        without ever having been scored.
    """
    scores = {
        'ARIMA': row['ARIMA_RMSE'],
        'Prophet': row['Prophet_RMSE'],
        'RF': row['RF_RMSE'],
    }
    valid_scores = {name: rmse for name, rmse in scores.items() if pd.notnull(rmse)}
    if not valid_scores:
        return None
    return min(valid_scores, key=valid_scores.get)
# Label each (Country, Target) row with the model picked by best_model_picker.
summary_table['🎯 Best_Model'] = summary_table.apply(best_model_picker, axis=1)

# === Display Results
print("\n📊 Summary of Best Models per Country and Target:\n")
summary_columns = ['Country', 'Target', 'ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE', '🎯 Best_Model']
summary_text = summary_table[summary_columns].to_string(index=False)
print(summary_text)

# === Optional Preview of Forecasts
# Keep only a handful of milestone years and order them for reading.
sample_years = [2025, 2030, 2040, 2050, 2060, 2074]
in_sample_years = df_model_comparison['Year'].isin(sample_years)
df_sample = df_model_comparison[in_sample_years].sort_values(['Country', 'Target', 'Year'])
print("\n📋 Forecasts for Selected Years:\n")
preview_text = df_sample.head(30).to_string(index=False)
print(preview_text)

# Export the sampled forecasts to CSV
df_sample.to_csv("df_sample.csv", index=False)

# Download the CSV to your computer (Colab only)
from google.colab import files
files.download("df_sample.csv")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jhcwchc0.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vweoogyl.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=24914', 'data', 'file=/tmp/tmprjkocm4m/jhcwchc0.json', 'init=/tmp/tmprjkocm4m/vweoogyl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelj0wnzvhz/prophet_model-20250723141643.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:43 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:44 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rre7spu8.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0mrqlqfe.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31573', 'data', 'file=/tmp/tmprjkocm4m/rre7spu8.json', 'init=/tmp/tmprjkocm4m/0mrqlqfe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelp876x4fq/prophet_model-20250723141644.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:44 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:45 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. 
Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hoa_1hsa.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2njsc_od.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7159', 'data', 'file=/tmp/tmprjkocm4m/hoa_1hsa.json', 'init=/tmp/tmprjkocm4m/2njsc_od.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3nlagzka/prophet_model-20250723141646.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:46 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:46 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/l8ke2jrx.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lhjjw6g7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80603', 'data', 'file=/tmp/tmprjkocm4m/l8ke2jrx.json', 'init=/tmp/tmprjkocm4m/lhjjw6g7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model38jdxkhn/prophet_model-20250723141647.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:47 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:48 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. 
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gdxcl7wl.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3xke9evx.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87665', 'data', 'file=/tmp/tmprjkocm4m/gdxcl7wl.json', 'init=/tmp/tmprjkocm4m/3xke9evx.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpu7wv7cl/prophet_model-20250723141649.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:49 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:51 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6oeumoy8.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/htc62axo.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=95466', 'data', 'file=/tmp/tmprjkocm4m/6oeumoy8.json', 'init=/tmp/tmprjkocm4m/htc62axo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model81f1yt28/prophet_model-20250723141652.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:52 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:53 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. 
Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h09yeslt.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4qv94o17.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=5296', 'data', 'file=/tmp/tmprjkocm4m/h09yeslt.json', 'init=/tmp/tmprjkocm4m/4qv94o17.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeltfej5lav/prophet_model-20250723141654.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:54 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:54 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/iafuzlyb.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6ncgu1me.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=594', 'data', 'file=/tmp/tmprjkocm4m/iafuzlyb.json', 'init=/tmp/tmprjkocm4m/6ncgu1me.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5bun8m44/prophet_model-20250723141655.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:55 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:55 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/srnfxhb8.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/or_7mte5.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=93099', 'data', 'file=/tmp/tmprjkocm4m/srnfxhb8.json', 'init=/tmp/tmprjkocm4m/or_7mte5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0njs55rf/prophet_model-20250723141656.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:56 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:57 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/7x29i9xw.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ab_73mx1.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=32920', 'data', 'file=/tmp/tmprjkocm4m/7x29i9xw.json', 'init=/tmp/tmprjkocm4m/ab_73mx1.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln5iq_o5t/prophet_model-20250723141657.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:57 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:16:58 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a1thbkwi.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1i9_4q6l.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=71652', 'data', 'file=/tmp/tmprjkocm4m/a1thbkwi.json', 'init=/tmp/tmprjkocm4m/1i9_4q6l.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model9rqekc0a/prophet_model-20250723141659.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:16:59 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:00 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q25m6dmr.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/x43il6jd.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13303', 'data', 'file=/tmp/tmprjkocm4m/q25m6dmr.json', 'init=/tmp/tmprjkocm4m/x43il6jd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modellp41_i2v/prophet_model-20250723141700.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:00 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:01 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/7gvsqyum.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6_wbkee5.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=66181', 'data', 'file=/tmp/tmprjkocm4m/7gvsqyum.json', 'init=/tmp/tmprjkocm4m/6_wbkee5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelu6qh116d/prophet_model-20250723141702.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:02 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:02 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xhvfnn5y.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lmtawef6.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=23981', 'data', 'file=/tmp/tmprjkocm4m/xhvfnn5y.json', 'init=/tmp/tmprjkocm4m/lmtawef6.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelujr2fgkc/prophet_model-20250723141704.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:04 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:04 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t6r67krm.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1mfv8zln.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7313', 'data', 'file=/tmp/tmprjkocm4m/t6r67krm.json', 'init=/tmp/tmprjkocm4m/1mfv8zln.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelwv5so_lt/prophet_model-20250723141705.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:05 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:06 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9kj3lavx.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ct0bz22w.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=68870', 'data', 'file=/tmp/tmprjkocm4m/9kj3lavx.json', 'init=/tmp/tmprjkocm4m/ct0bz22w.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnk_bh61a/prophet_model-20250723141706.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:06 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:06 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vxcvvjrc.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/uflvq_h7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97440', 'data', 'file=/tmp/tmprjkocm4m/vxcvvjrc.json', 'init=/tmp/tmprjkocm4m/uflvq_h7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2dktv7ri/prophet_model-20250723141707.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:07 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:07 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wtggddnh.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3ug3tgbu.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=33870', 'data', 'file=/tmp/tmprjkocm4m/wtggddnh.json', 'init=/tmp/tmprjkocm4m/3ug3tgbu.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln00z30ma/prophet_model-20250723141708.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:08 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:08 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yal8p5kk.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/d9pb_k0z.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=74442', 'data', 'file=/tmp/tmprjkocm4m/yal8p5kk.json', 'init=/tmp/tmprjkocm4m/d9pb_k0z.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeld_04fv06/prophet_model-20250723141708.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:08 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:09 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/at3ncfq7.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bw701o_1.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87312', 'data', 'file=/tmp/tmprjkocm4m/at3ncfq7.json', 'init=/tmp/tmprjkocm4m/bw701o_1.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelvrp0ayey/prophet_model-20250723141709.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:09 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:09 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bk99waij.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9ugdadji.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77144', 'data', 'file=/tmp/tmprjkocm4m/bk99waij.json', 'init=/tmp/tmprjkocm4m/9ugdadji.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model6o9rlz6u/prophet_model-20250723141710.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:10 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:10 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o8r1wigd.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/28o8aerw.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18201', 'data', 'file=/tmp/tmprjkocm4m/o8r1wigd.json', 'init=/tmp/tmprjkocm4m/28o8aerw.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5gbxmkf0/prophet_model-20250723141711.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:11 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:11 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jqzok81d.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xql54bjo.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=71865', 'data', 'file=/tmp/tmprjkocm4m/jqzok81d.json', 'init=/tmp/tmprjkocm4m/xql54bjo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelx0qhsg4m/prophet_model-20250723141711.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:11 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:12 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t8ktr0zd.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tdi6fb39.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=49385', 'data', 'file=/tmp/tmprjkocm4m/t8ktr0zd.json', 'init=/tmp/tmprjkocm4m/tdi6fb39.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelz0g653ov/prophet_model-20250723141712.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:12 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zrg5xabp.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_gjgf0g3.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1044', 'data', 'file=/tmp/tmprjkocm4m/zrg5xabp.json', 'init=/tmp/tmprjkocm4m/_gjgf0g3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelsgt2zo9_/prophet_model-20250723141713.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:13 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xakcg18v.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0ybp7s7o.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=55205', 'data', 'file=/tmp/tmprjkocm4m/xakcg18v.json', 'init=/tmp/tmprjkocm4m/0ybp7s7o.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelvo4r4_mp/prophet_model-20250723141714.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:14 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:14 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/69ylthlq.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yld0oep9.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=74260', 'data', 'file=/tmp/tmprjkocm4m/69ylthlq.json', 'init=/tmp/tmprjkocm4m/yld0oep9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelp73lir1i/prophet_model-20250723141714.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:14 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:15 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/awp7vhhy.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cll5xh9y.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=41605', 'data', 'file=/tmp/tmprjkocm4m/awp7vhhy.json', 'init=/tmp/tmprjkocm4m/cll5xh9y.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelm8jm_mj_/prophet_model-20250723141715.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:15 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:16 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rfipcmzh.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_qdzg3hl.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=62515', 'data', 'file=/tmp/tmprjkocm4m/rfipcmzh.json', 'init=/tmp/tmprjkocm4m/_qdzg3hl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model63lxhar3/prophet_model-20250723141717.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:17 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:17 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2wi25chy.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3t2y78lk.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=34308', 'data', 'file=/tmp/tmprjkocm4m/2wi25chy.json', 'init=/tmp/tmprjkocm4m/3t2y78lk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeleaesl2v1/prophet_model-20250723141718.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:17:18 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:17:18 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
📊 Summary of Best Models per Country and Target:
Country Target ARIMA_RMSE Prophet_RMSE RF_RMSE 🎯 Best_Model
Bangladesh Cardiovascular diseases 1.175582 6.991238 4.924493 ARIMA
Bangladesh Diabetes 0.000036 2.987844 0.101733 ARIMA
Bangladesh Life expectancy 2.312728 1.676697 2.298684 Prophet
Brazil Cardiovascular diseases 1.819507 6.547227 3.512954 ARIMA
Brazil Diabetes 0.000000 0.186005 0.045713 ARIMA
Brazil Life expectancy 3.009573 2.189554 1.286215 RF
Germany Cardiovascular diseases 0.433925 2.125500 0.950348 ARIMA
Germany Diabetes 0.000000 2.758175 0.000000 ARIMA
Germany Life expectancy 0.474573 0.612408 0.336656 RF
India Cardiovascular diseases 19.662985 37.420988 47.551155 ARIMA
India Diabetes 0.019744 0.830592 0.001732 RF
India Life expectancy 1.973657 2.475751 2.190597 ARIMA
Indonesia Cardiovascular diseases 8.486563 7.998086 0.097082 RF
Indonesia Diabetes 0.000000 0.712114 0.003464 ARIMA
Indonesia Life expectancy 1.887179 1.692886 1.644150 RF
Japan Cardiovascular diseases 1.547668 7.688441 4.237571 ARIMA
Japan Diabetes 0.000000 1.841061 0.016166 ARIMA
Japan Life expectancy 0.638746 0.576474 0.319972 RF
Kenya Cardiovascular diseases 0.121752 0.933468 0.799274 ARIMA
Kenya Diabetes 0.000379 3.479734 0.005196 ARIMA
Kenya Life expectancy 3.235337 1.670562 1.293366 RF
Mexico Cardiovascular diseases 0.578806 0.843693 6.276441 ARIMA
Mexico Diabetes 0.000000 0.799705 0.412910 ARIMA
Mexico Life expectancy 6.224500 2.428620 2.490150 Prophet
Nigeria Cardiovascular diseases 0.716350 4.498448 3.617701 ARIMA
Nigeria Diabetes 0.000000 0.140798 0.002708 ARIMA
Nigeria Life expectancy 0.700330 0.369290 1.244393 Prophet
United States Cardiovascular diseases 1.190369 11.974926 10.091925 ARIMA
United States Diabetes 0.007983 0.489566 0.004000 RF
United States Life expectancy 1.996910 1.561422 1.217660 RF
📋 Forecasts for Selected Years:
Country Target Year ARIMA_RMSE ARIMA_Forecast Prophet_RMSE Prophet_Forecast RF_RMSE RF_Forecast
Bangladesh Cardiovascular diseases 2025 1.175582 30.440474 6.991238 22.343115 4.924493 23.340201
Bangladesh Cardiovascular diseases 2030 1.175582 31.940794 6.991238 24.463361 4.924493 23.340201
Bangladesh Cardiovascular diseases 2040 1.175582 34.260180 6.991238 28.785528 4.924493 23.340201
Bangladesh Cardiovascular diseases 2050 1.175582 35.897214 6.991238 32.425885 4.924493 23.340201
Bangladesh Cardiovascular diseases 2060 1.175582 37.052640 6.991238 36.748052 4.924493 23.340201
Bangladesh Diabetes 2025 0.000036 9.800033 2.987844 6.593908 0.101733 9.643000
Bangladesh Diabetes 2030 0.000036 9.800026 2.987844 6.364059 0.101733 9.643000
Bangladesh Diabetes 2040 0.000036 9.800027 2.987844 5.788692 0.101733 9.643000
Bangladesh Diabetes 2050 0.000036 9.800027 2.987844 5.211562 0.101733 9.643000
Bangladesh Diabetes 2060 0.000036 9.800027 2.987844 4.636195 0.101733 9.643000
Bangladesh Life expectancy 2025 2.312728 71.671340 1.676697 76.010260 2.298684 71.741197
Bangladesh Life expectancy 2030 2.312728 71.671358 1.676697 77.911348 2.298684 71.741197
Bangladesh Life expectancy 2040 2.312728 71.671358 1.676697 83.766696 2.298684 71.741197
Bangladesh Life expectancy 2050 2.312728 71.671358 1.676697 88.736171 2.298684 71.741197
Bangladesh Life expectancy 2060 2.312728 71.671358 1.676697 94.591519 2.298684 71.741197
Brazil Cardiovascular diseases 2025 1.819507 37.512576 6.547227 34.048560 3.512954 35.433321
Brazil Cardiovascular diseases 2030 1.819507 38.026433 6.547227 37.240321 3.512954 35.433321
Brazil Cardiovascular diseases 2040 1.819507 38.724635 6.547227 43.801768 3.512954 35.433321
Brazil Cardiovascular diseases 2050 1.819507 39.136841 6.547227 49.309747 3.512954 35.433321
Brazil Cardiovascular diseases 2060 1.819507 39.380199 6.547227 55.871194 3.512954 35.433321
Brazil Diabetes 2025 0.000000 8.300000 0.186005 8.233562 0.045713 8.348000
Brazil Diabetes 2030 0.000000 8.300000 0.186005 8.506609 0.045713 8.348000
Brazil Diabetes 2040 0.000000 8.300000 0.186005 8.908965 0.045713 8.348000
Brazil Diabetes 2050 0.000000 8.300000 0.186005 9.428119 0.045713 8.348000
Brazil Diabetes 2060 0.000000 8.300000 0.186005 9.830475 0.045713 8.348000
Brazil Life expectancy 2025 3.009573 69.422619 2.189554 77.316414 1.286215 73.905454
Brazil Life expectancy 2030 3.009573 65.412065 2.189554 78.606403 1.286215 73.905454
Brazil Life expectancy 2040 3.009573 59.751236 2.189554 81.065716 1.286215 73.905454
Brazil Life expectancy 2050 3.009573 56.227383 2.189554 83.708557 1.286215 73.905454
Brazil Life expectancy 2060 3.009573 54.033793 2.189554 86.167871 1.286215 73.905454
Based on the analysis of the Rolling Forecast Validation Summary, the best forecasting model varies by target health outcome—life expectancy, cardiovascular diseases, and diabetes—depending on performance measured by RMSE (Root Mean Square Error). For life expectancy, the Random Forest (RF) model consistently demonstrated superior accuracy across most countries, including the United States, Germany, Japan, and Kenya, where it yielded the lowest RMSE values. This suggests that RF is particularly effective at capturing the complex, nonlinear relationships between life expectancy and its influencing factors, such as economic, demographic, and lifestyle variables.
In the case of cardiovascular diseases, the ARIMA model generally performed best, delivering the lowest RMSEs in nine of the ten countries, including Germany, Brazil, Japan, and the United States. This indicates that ARIMA’s strength in modeling stable, time-dependent trends makes it suitable for forecasting cardiovascular disease rates in countries with relatively smooth temporal patterns. However, there is one notable exception: in Indonesia, RF clearly outperformed ARIMA (RMSE of about 0.10 versus 8.49). In India, Kenya, and Bangladesh, ARIMA still achieved the lowest error, although the errors for India were large for all three models. This suggests that RF’s flexibility can pay off where a country’s disease rates follow more complex or rapidly shifting patterns, but that this is the exception rather than the rule for this target.
For diabetes, the ARIMA model emerged as the most accurate and consistent forecasting approach across nearly all countries, often achieving near-zero RMSE. Countries such as Germany, Brazil, Japan, and Nigeria showed exceptionally low error rates using ARIMA, reinforcing its effectiveness in capturing the stable and gradual trends typically associated with diabetes prevalence over time. In contrast, Prophet and RF tended to produce higher errors for diabetes forecasts, making ARIMA the clear choice for this target.
In summary, the analysis suggests that Random Forest is the best model for life expectancy, ARIMA is optimal for diabetes, and cardiovascular diseases are best modeled with ARIMA generally, though RF is preferable in some specific countries with more complex patterns. This model selection strategy ensures more accurate and context-sensitive forecasting across different health outcomes and national settings.
Summary of Best Models per Country and Target
# Summary of Best Models per Country and Target
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import warnings
import logging
warnings.filterwarnings("ignore")
logging.getLogger('statsmodels').setLevel(logging.ERROR)
# === Time Ranges
# Inclusive training window, the held-out years used for RMSE scoring, and the
# years to project (2024 through 2074).
start_train, end_train = 1950, 2020
eval_years = list(range(2021, 2024))
forecast_horizon = [*range(2024, 2075)]

# === Input Variables
# Predictor columns per target. Names ending in "_lagN" are presumably lagged
# copies of the base column built earlier in the notebook -- confirm there.
selected_features_dict = {
    'Life expectancy': [
        'Child mortality rate',
        'GDP',
        'CPI_lag3',
        'Incomplete tertiary education_lag3',
        'Income_lag3',
        'Income',
        'CPI',
        'Inflation',
        'Inflation_lag1',
        'Cost of a healthy diet',
        'Cost of a healthy diet_lag3',
        'Unemployment Rate_lag2',
        'Gini coefficient_lag3',
        'Unemployment Rate_lag1',
    ],
    'Cardiovascular diseases': ['BMI_avg_lag3'],
    'Diabetes': [
        'BMI_avg_lag3',
        'CPI',
        'GDP',
        'Income',
        'Income_lag1',
        'Inflation_lag1',
        'Inflation',
        'Cost of a healthy diet',
        'Inflation_lag2',
        'Inflation_lag3',
    ],
}
# === Ready Dataset (already loaded)
# df_forecast_ready = your real dataset
# === Forecasting and Evaluation
# For every (country, target) pair: fit ARIMA, Prophet and Random Forest on the
# training years, score each by RMSE on the evaluation years, and append one
# row per forecast-horizon year to `forecast_summary`.
# A model that fails keeps RMSE / forecast values of None; the failure is
# logged rather than silently discarded so that gaps in the summary table can
# be traced back to their cause.
forecast_summary = []
for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')
    for target in target_columns:
        if target not in df_country.columns:
            continue
        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            continue

        df_train = df_country[df_country['Year'].between(start_train, end_train)]
        df_eval = df_country[df_country['Year'].isin(eval_years)]
        df_forecast = df_country[df_country['Year'].isin(forecast_horizon)]
        n_forecast = len(df_forecast)
        if not n_forecast:
            # Rows are emitted per forecast year, so without forecast-horizon
            # rows nothing would be recorded; skip the expensive model fits.
            logging.warning("No forecast-horizon rows for %s / %s; skipped.", country, target)
            continue
        actual_eval = df_eval[target].values

        #### ARIMA ####
        arima_rmse, arima_forecast = None, [None] * n_forecast
        try:
            train_series = df_train[[target]].copy()
            # Synthetic yearly index: predictions below are addressed by
            # integer position, the dates only give the series a frequency.
            train_series.index = pd.date_range(
                start=str(start_train), periods=len(train_series), freq='YE'
            )
            arima_model = ARIMA(train_series, order=(1, 1, 1)).fit()
            n_train, n_eval = len(train_series), len(df_eval)
            # Steps right after the training sample correspond to the eval rows.
            pred_eval = arima_model.predict(start=n_train, end=n_train + n_eval - 1)
            arima_rmse = np.sqrt(mean_squared_error(actual_eval, pred_eval))
            # The forecast horizon follows directly after the eval steps.
            arima_forecast = arima_model.predict(
                start=n_train + n_eval,
                end=n_train + n_eval + n_forecast - 1,
            ).tolist()
        except Exception as exc:
            logging.warning("ARIMA failed for %s / %s: %s", country, target, exc)

        #### Prophet ####
        prophet_rmse, prophet_forecast = None, [None] * n_forecast
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            prophet_model = Prophet()
            prophet_model.fit(prophet_df)
            # Predict for the years actually present in df_eval so predictions
            # line up with actual_eval even when an evaluation year is missing.
            eval_dates = pd.DataFrame({'ds': pd.to_datetime(df_eval['Year'], format='%Y')})
            forecast_eval = prophet_model.predict(eval_dates)
            prophet_rmse = np.sqrt(mean_squared_error(actual_eval, forecast_eval['yhat'].values))
            forecast_years = pd.DataFrame({'ds': pd.to_datetime(df_forecast['Year'], format='%Y')})
            prophet_forecast = prophet_model.predict(forecast_years)['yhat'].tolist()
        except Exception as exc:
            logging.warning("Prophet failed for %s / %s: %s", country, target, exc)

        #### Random Forest ####
        rf_rmse, rf_forecast = None, [None] * n_forecast
        try:
            X_train, y_train = df_train[available_features], df_train[target]
            X_eval, y_eval = df_eval[available_features], df_eval[target]
            rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            rf_model.fit(X_train, y_train)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, rf_model.predict(X_eval)))
            X_forecast = df_forecast[available_features]
            # Forecast only when every future feature value is known;
            # otherwise the forecast entries stay None.
            if not X_forecast.isnull().any(axis=1).any():
                rf_forecast = rf_model.predict(X_forecast).tolist()
        except Exception as exc:
            logging.warning("Random Forest failed for %s / %s: %s", country, target, exc)

        # One output row per forecast year; the RMSE values repeat on each row.
        for year, arima_value, prophet_value, rf_value in zip(
            df_forecast['Year'], arima_forecast, prophet_forecast, rf_forecast
        ):
            forecast_summary.append({
                "Country": country,
                "Target": target,
                "Year": year,
                "ARIMA_RMSE": arima_rmse,
                "ARIMA_Forecast": arima_value,
                "Prophet_RMSE": prophet_rmse,
                "Prophet_Forecast": prophet_value,
                "RF_RMSE": rf_rmse,
                "RF_Forecast": rf_value,
            })
# === Combine All Results
# One row per (country, target, forecast year).
df_model_comparison = pd.DataFrame(forecast_summary)

# === Summary Table: Best Model by RMSE
# Reduce to one row per (Country, Target), taking the first non-null RMSE
# of each model within the group.
rmse_columns = ['ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE']
grouped_by_pair = df_model_comparison.groupby(['Country', 'Target'])
summary_table = grouped_by_pair[rmse_columns].first()
summary_table = summary_table.reset_index()
def best_model_picker(row):
    """Return the name of the model with the lowest RMSE in *row*.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key (e.g. the ``pandas.Series`` passed by
        ``DataFrame.apply(..., axis=1)``) that provides ``'ARIMA_RMSE'``,
        ``'Prophet_RMSE'`` and ``'RF_RMSE'``.

    Returns
    -------
    str or None
        ``'ARIMA'``, ``'Prophet'`` or ``'RF'``. Ties go to the earliest name
        in that order. Missing scores (``None``/``NaN``) are ignored, and
        ``None`` is returned when all three are missing, so that a pair for
        which no model could be scored is not reported as an ARIMA win.
    """
    scores = {
        'ARIMA': row['ARIMA_RMSE'],
        'Prophet': row['Prophet_RMSE'],
        'RF': row['RF_RMSE'],
    }
    # Keep only models that actually produced a score; dict order is
    # preserved, which keeps the original tie-breaking order.
    valid_scores = {name: rmse for name, rmse in scores.items() if pd.notnull(rmse)}
    if not valid_scores:
        return None
    return min(valid_scores, key=valid_scores.get)
# Label every (Country, Target) row with the model chosen by best_model_picker.
summary_table['🎯 Best_Model'] = summary_table.apply(best_model_picker, axis=1)

# === Display Results
report_columns = ['Country', 'Target', 'ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE', '🎯 Best_Model']
print("\n📊 Summary of Best Models per Country and Target:\n")
print(summary_table[report_columns].to_string(index=False))

# === Optional Preview of Forecasts
# Show at most 30 rows, restricted to a handful of milestone years.
sample_years = [2025, 2030, 2040, 2050, 2060, 2074]
in_sample_years = df_model_comparison['Year'].isin(sample_years)
df_sample = df_model_comparison[in_sample_years]
df_sample = df_sample.sort_values(['Country', 'Target', 'Year'])
print("\n📋 Forecasts for Selected Years:\n")
print(df_sample.head(30).to_string(index=False))

# Write the summary to CSV, then hand the file to the Colab download helper.
summary_table.to_csv("summary_table.csv", index=False)
from google.colab import files
files.download("summary_table.csv")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0uaygta5.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/y_0bf22b.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79884', 'data', 'file=/tmp/tmprjkocm4m/0uaygta5.json', 'init=/tmp/tmprjkocm4m/y_0bf22b.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modele4l2i_hb/prophet_model-20250723142704.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:04 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:05 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wxtuj1yg.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jercausa.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=63095', 'data', 'file=/tmp/tmprjkocm4m/wxtuj1yg.json', 'init=/tmp/tmprjkocm4m/jercausa.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8rpms0_i/prophet_model-20250723142705.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:05 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:06 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. 
Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tyalwic6.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hpdvqxbc.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=21626', 'data', 'file=/tmp/tmprjkocm4m/tyalwic6.json', 'init=/tmp/tmprjkocm4m/hpdvqxbc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3ajmvsiq/prophet_model-20250723142708.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:08 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:09 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c4giovs_.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o26sa9z_.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31802', 'data', 'file=/tmp/tmprjkocm4m/c4giovs_.json', 'init=/tmp/tmprjkocm4m/o26sa9z_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelifo61iqm/prophet_model-20250723142710.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:10 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:10 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. 
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/batx6lqv.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/u80aprol.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87810', 'data', 'file=/tmp/tmprjkocm4m/batx6lqv.json', 'init=/tmp/tmprjkocm4m/u80aprol.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelex_ra9bb/prophet_model-20250723142711.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:11 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:12 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/geb2ga5h.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bt4alwzn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=8324', 'data', 'file=/tmp/tmprjkocm4m/geb2ga5h.json', 'init=/tmp/tmprjkocm4m/bt4alwzn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelg00w53q7/prophet_model-20250723142712.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:12 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. 
Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9tf6yo1l.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fax3thm9.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80492', 'data', 'file=/tmp/tmprjkocm4m/9tf6yo1l.json', 'init=/tmp/tmprjkocm4m/fax3thm9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model189h6oxf/prophet_model-20250723142715.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:15 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:16 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hzffesbf.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bjuloas1.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22647', 'data', 'file=/tmp/tmprjkocm4m/hzffesbf.json', 'init=/tmp/tmprjkocm4m/bjuloas1.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln8pwb9bg/prophet_model-20250723142718.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:18 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:19 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zc6sw2ry.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9hhkwcqg.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13649', 'data', 'file=/tmp/tmprjkocm4m/zc6sw2ry.json', 'init=/tmp/tmprjkocm4m/9hhkwcqg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model9lmy0wxx/prophet_model-20250723142719.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:19 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:20 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8ji8tg9w.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wen2r1fn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57148', 'data', 'file=/tmp/tmprjkocm4m/8ji8tg9w.json', 'init=/tmp/tmprjkocm4m/wen2r1fn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhxsjir7p/prophet_model-20250723142722.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:22 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:22 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/dbzp56ep.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/d5p02kbc.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87335', 'data', 'file=/tmp/tmprjkocm4m/dbzp56ep.json', 'init=/tmp/tmprjkocm4m/d5p02kbc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelprddtynf/prophet_model-20250723142723.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:23 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:23 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1ole7me2.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9yw8_zbw.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31899', 'data', 'file=/tmp/tmprjkocm4m/1ole7me2.json', 'init=/tmp/tmprjkocm4m/9yw8_zbw.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelt6rfkgyk/prophet_model-20250723142724.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:24 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:24 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6g0_e09q.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9q73uimo.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=29568', 'data', 'file=/tmp/tmprjkocm4m/6g0_e09q.json', 'init=/tmp/tmprjkocm4m/9q73uimo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model62zbvolv/prophet_model-20250723142725.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:25 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:25 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pll52ekx.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o4uah_jb.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38568', 'data', 'file=/tmp/tmprjkocm4m/pll52ekx.json', 'init=/tmp/tmprjkocm4m/o4uah_jb.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelu2bceios/prophet_model-20250723142725.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:25 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:26 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rrw4jsng.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/s6r9b384.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=99013', 'data', 'file=/tmp/tmprjkocm4m/rrw4jsng.json', 'init=/tmp/tmprjkocm4m/s6r9b384.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzggr85id/prophet_model-20250723142726.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:26 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:27 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3uj2mjnf.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3iufvc_v.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=55868', 'data', 'file=/tmp/tmprjkocm4m/3uj2mjnf.json', 'init=/tmp/tmprjkocm4m/3iufvc_v.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeljh75tgip/prophet_model-20250723142728.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:28 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:28 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cc99mxtp.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/z38960xn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=56719', 'data', 'file=/tmp/tmprjkocm4m/cc99mxtp.json', 'init=/tmp/tmprjkocm4m/z38960xn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldkf4uq7l/prophet_model-20250723142729.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:29 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:29 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/n4waejiy.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ddkz4fw7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38710', 'data', 'file=/tmp/tmprjkocm4m/n4waejiy.json', 'init=/tmp/tmprjkocm4m/ddkz4fw7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelwik2h2k_/prophet_model-20250723142730.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:30 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:30 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yick6axh.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tpmo8mfn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=60372', 'data', 'file=/tmp/tmprjkocm4m/yick6axh.json', 'init=/tmp/tmprjkocm4m/tpmo8mfn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelrbqjn69i/prophet_model-20250723142730.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:30 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:31 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yy1ou4dl.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3ufp0eys.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=65453', 'data', 'file=/tmp/tmprjkocm4m/yy1ou4dl.json', 'init=/tmp/tmprjkocm4m/3ufp0eys.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldjftl38l/prophet_model-20250723142731.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:31 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:32 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/alvi6xs0.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/l135u0am.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18390', 'data', 'file=/tmp/tmprjkocm4m/alvi6xs0.json', 'init=/tmp/tmprjkocm4m/l135u0am.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model7j6py8hp/prophet_model-20250723142732.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:32 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:32 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6rvt9dt5.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/y7w05mpe.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6064', 'data', 'file=/tmp/tmprjkocm4m/6rvt9dt5.json', 'init=/tmp/tmprjkocm4m/y7w05mpe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzx285i_x/prophet_model-20250723142733.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:33 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:33 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6ws1o14o.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yj9xeaw5.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57084', 'data', 'file=/tmp/tmprjkocm4m/6ws1o14o.json', 'init=/tmp/tmprjkocm4m/yj9xeaw5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeltfwe1azm/prophet_model-20250723142733.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:34 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:34 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0_ha6_es.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/80y4z0qs.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=82127', 'data', 'file=/tmp/tmprjkocm4m/0_ha6_es.json', 'init=/tmp/tmprjkocm4m/80y4z0qs.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhg28fgqr/prophet_model-20250723142734.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:34 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:35 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q7bqzno3.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/arav09wr.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=69149', 'data', 'file=/tmp/tmprjkocm4m/q7bqzno3.json', 'init=/tmp/tmprjkocm4m/arav09wr.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_0k_gwxi/prophet_model-20250723142735.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:35 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:36 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a_wxfuz1.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/x1_dz2um.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79623', 'data', 'file=/tmp/tmprjkocm4m/a_wxfuz1.json', 'init=/tmp/tmprjkocm4m/x1_dz2um.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3zsz2fsm/prophet_model-20250723142736.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:36 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:36 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2z3mu9uq.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ghs_5hi3.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80892', 'data', 'file=/tmp/tmprjkocm4m/2z3mu9uq.json', 'init=/tmp/tmprjkocm4m/ghs_5hi3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzmttl_kw/prophet_model-20250723142737.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:37 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:37 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0czxr6n3.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_n6wsd0y.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6662', 'data', 'file=/tmp/tmprjkocm4m/0czxr6n3.json', 'init=/tmp/tmprjkocm4m/_n6wsd0y.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model25eiolvn/prophet_model-20250723142737.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:37 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:38 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c9lghnac.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h21hf3u2.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75262', 'data', 'file=/tmp/tmprjkocm4m/c9lghnac.json', 'init=/tmp/tmprjkocm4m/h21hf3u2.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeleic9ik8d/prophet_model-20250723142738.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:38 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:38 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0xko49k2.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wbtnovqr.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1437', 'data', 'file=/tmp/tmprjkocm4m/0xko49k2.json', 'init=/tmp/tmprjkocm4m/wbtnovqr.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelv5a_av8n/prophet_model-20250723142739.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:39 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:39 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
📊 Summary of Best Models per Country and Target:
Country Target ARIMA_RMSE Prophet_RMSE RF_RMSE 🎯 Best_Model
Bangladesh Cardiovascular diseases 1.175582 6.991238 4.924493 ARIMA
Bangladesh Diabetes 0.000036 2.987844 0.101733 ARIMA
Bangladesh Life expectancy 2.312728 1.676697 2.298684 Prophet
Brazil Cardiovascular diseases 1.819507 6.547227 3.512954 ARIMA
Brazil Diabetes 0.000000 0.186005 0.045713 ARIMA
Brazil Life expectancy 3.009573 2.189554 1.286215 RF
Germany Cardiovascular diseases 0.433925 2.125500 0.950348 ARIMA
Germany Diabetes 0.000000 2.758175 0.000000 ARIMA
Germany Life expectancy 0.474573 0.612408 0.336656 RF
India Cardiovascular diseases 19.662985 37.420988 47.551155 ARIMA
India Diabetes 0.019744 0.830592 0.001732 RF
India Life expectancy 1.973657 2.475751 2.190597 ARIMA
Indonesia Cardiovascular diseases 8.486563 7.998086 0.097082 RF
Indonesia Diabetes 0.000000 0.712114 0.003464 ARIMA
Indonesia Life expectancy 1.887179 1.692886 1.644150 RF
Japan Cardiovascular diseases 1.547668 7.688441 4.237571 ARIMA
Japan Diabetes 0.000000 1.841061 0.016166 ARIMA
Japan Life expectancy 0.638746 0.576474 0.319972 RF
Kenya Cardiovascular diseases 0.121752 0.933468 0.799274 ARIMA
Kenya Diabetes 0.000379 3.479734 0.005196 ARIMA
Kenya Life expectancy 3.235337 1.670562 1.293366 RF
Mexico Cardiovascular diseases 0.578806 0.843693 6.276441 ARIMA
Mexico Diabetes 0.000000 0.799705 0.412910 ARIMA
Mexico Life expectancy 6.224500 2.428620 2.490150 Prophet
Nigeria Cardiovascular diseases 0.716350 4.498448 3.617701 ARIMA
Nigeria Diabetes 0.000000 0.140798 0.002708 ARIMA
Nigeria Life expectancy 0.700330 0.369290 1.244393 Prophet
United States Cardiovascular diseases 1.190369 11.974926 10.091925 ARIMA
United States Diabetes 0.007983 0.489566 0.004000 RF
United States Life expectancy 1.996910 1.561422 1.217660 RF
📋 Forecasts for Selected Years:
Country Target Year ARIMA_RMSE ARIMA_Forecast Prophet_RMSE Prophet_Forecast RF_RMSE RF_Forecast
Bangladesh Cardiovascular diseases 2025 1.175582 30.440474 6.991238 22.343115 4.924493 23.340201
Bangladesh Cardiovascular diseases 2030 1.175582 31.940794 6.991238 24.463361 4.924493 23.340201
Bangladesh Cardiovascular diseases 2040 1.175582 34.260180 6.991238 28.785528 4.924493 23.340201
Bangladesh Cardiovascular diseases 2050 1.175582 35.897214 6.991238 32.425885 4.924493 23.340201
Bangladesh Cardiovascular diseases 2060 1.175582 37.052640 6.991238 36.748052 4.924493 23.340201
Bangladesh Diabetes 2025 0.000036 9.800033 2.987844 6.593908 0.101733 9.643000
Bangladesh Diabetes 2030 0.000036 9.800026 2.987844 6.364059 0.101733 9.643000
Bangladesh Diabetes 2040 0.000036 9.800027 2.987844 5.788692 0.101733 9.643000
Bangladesh Diabetes 2050 0.000036 9.800027 2.987844 5.211562 0.101733 9.643000
Bangladesh Diabetes 2060 0.000036 9.800027 2.987844 4.636195 0.101733 9.643000
Bangladesh Life expectancy 2025 2.312728 71.671340 1.676697 76.010260 2.298684 71.741197
Bangladesh Life expectancy 2030 2.312728 71.671358 1.676697 77.911348 2.298684 71.741197
Bangladesh Life expectancy 2040 2.312728 71.671358 1.676697 83.766696 2.298684 71.741197
Bangladesh Life expectancy 2050 2.312728 71.671358 1.676697 88.736171 2.298684 71.741197
Bangladesh Life expectancy 2060 2.312728 71.671358 1.676697 94.591519 2.298684 71.741197
Brazil Cardiovascular diseases 2025 1.819507 37.512576 6.547227 34.048560 3.512954 35.433321
Brazil Cardiovascular diseases 2030 1.819507 38.026433 6.547227 37.240321 3.512954 35.433321
Brazil Cardiovascular diseases 2040 1.819507 38.724635 6.547227 43.801768 3.512954 35.433321
Brazil Cardiovascular diseases 2050 1.819507 39.136841 6.547227 49.309747 3.512954 35.433321
Brazil Cardiovascular diseases 2060 1.819507 39.380199 6.547227 55.871194 3.512954 35.433321
Brazil Diabetes 2025 0.000000 8.300000 0.186005 8.233562 0.045713 8.348000
Brazil Diabetes 2030 0.000000 8.300000 0.186005 8.506609 0.045713 8.348000
Brazil Diabetes 2040 0.000000 8.300000 0.186005 8.908965 0.045713 8.348000
Brazil Diabetes 2050 0.000000 8.300000 0.186005 9.428119 0.045713 8.348000
Brazil Diabetes 2060 0.000000 8.300000 0.186005 9.830475 0.045713 8.348000
Brazil Life expectancy 2025 3.009573 69.422619 2.189554 77.316414 1.286215 73.905454
Brazil Life expectancy 2030 3.009573 65.412065 2.189554 78.606403 1.286215 73.905454
Brazil Life expectancy 2040 3.009573 59.751236 2.189554 81.065716 1.286215 73.905454
Brazil Life expectancy 2050 3.009573 56.227383 2.189554 83.708557 1.286215 73.905454
Brazil Life expectancy 2060 3.009573 54.033793 2.189554 86.167871 1.286215 73.905454
Evaluation metrics (RMSE, MAPE, R²)
# Evaluation metrics (RMSE, MAPE, R²)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def calculate_metrics(actual, predicted):
    """Return (RMSE, MAPE, R²) comparing predictions with observed values.

    Args:
        actual: Observed values (array-like).
        predicted: Predicted values, same length as ``actual`` (array-like;
            a pandas Series is accepted and compared by position).

    Returns:
        Tuple ``(rmse, mape, r2)``. RMSE and R² are rounded to 4 decimals,
        MAPE (in percent) to 2 decimals. MAPE is averaged only over the
        observations whose actual value is non-zero and is NaN when there
        are none, instead of dividing by zero.

    Raises:
        ValueError: Propagated from scikit-learn when the inputs are
            inconsistent (e.g. different lengths or NaN values).
    """
    # Plain float arrays: lists work, and a Series index cannot interfere
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)

    # Percentage error is undefined where the observed value is exactly zero
    nonzero = actual != 0
    if nonzero.any():
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = np.nan

    return round(rmse, 4), round(mape, 2), round(r2, 4)
# Collectors filled by the evaluation loop below
metrics_summary = []  # one metrics dict per (country, target, model)
eval_results = []     # one DataFrame of Random Forest predictions per (country, target)

# Hold-out years used for scoring; every model is fit on 1950-2020
eval_years = [2021, 2022, 2023]

for country in selected_countries:
    # Chronological order matters: ARIMA receives a positional yearly index and
    # the actual values are paired with eval_years by position.
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')
    train_mask = df_country['Year'].between(1950, 2020)
    eval_mask = df_country['Year'].isin(eval_years)

    for target in target_columns:
        if target not in df_country.columns:
            continue

        actual = df_country[eval_mask][target].values
        # Without exactly one row per evaluation year no model can be scored,
        # so say so up front instead of letting every model fail below.
        if len(actual) != len(eval_years):
            print(f"Skipping {country} / {target}: expected {len(eval_years)} "
                  f"evaluation rows, found {len(actual)}")
            continue

        # --- ARIMA ---
        try:
            train_series = df_country[train_mask][[target]]
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model_arima = ARIMA(train_series, order=(1, 1, 1)).fit()
            # Forecast the len(eval_years) steps right after the training sample
            arima_pred = model_arima.predict(
                start=len(train_series),
                end=len(train_series) + len(eval_years) - 1,
            )
            arima_rmse, arima_mape, arima_r2 = calculate_metrics(actual, arima_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "ARIMA",
                "RMSE": arima_rmse, "MAPE": arima_mape, "R²": arima_r2
            })
        except Exception as exc:
            # Best effort: report the failure and carry on with the other models
            print(f"ARIMA failed for {country} / {target}: {exc!r}")

        # --- Prophet ---
        try:
            prophet_df = df_country[train_mask][['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model_prophet = Prophet()
            model_prophet.fit(prophet_df)
            future_eval = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            prophet_pred = model_prophet.predict(future_eval)['yhat'].values
            prophet_rmse, prophet_mape, prophet_r2 = calculate_metrics(actual, prophet_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Prophet",
                "RMSE": prophet_rmse, "MAPE": prophet_mape, "R²": prophet_r2
            })
        except Exception as exc:
            print(f"Prophet failed for {country} / {target}: {exc!r}")

        # --- Random Forest ---
        try:
            # Use only the selected features that actually exist for this country
            features = selected_features_dict.get(target, [])
            available = [f for f in features if f in df_country.columns]
            X = df_country[available]
            y = df_country[target]
            X_train = X[train_mask]
            y_train = y[train_mask]
            # Predictions use the observed feature values of the evaluation years
            X_eval = X[eval_mask]
            model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
            model_rf.fit(X_train, y_train)
            rf_pred = model_rf.predict(X_eval)
            rf_rmse, rf_mape, rf_r2 = calculate_metrics(actual, rf_pred)

            # Keep the row-level predictions next to the observed values
            eval_rows = pd.DataFrame({
                "Country": [country] * len(eval_years),
                "Target": [target] * len(eval_years),
                "Year": eval_years,
                "Prediction": rf_pred,
                "Actual": actual
            })
            eval_results.append(eval_rows)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Random Forest",
                "RMSE": rf_rmse, "MAPE": rf_mape, "R²": rf_r2
            })
        except Exception as exc:
            print(f"Random Forest failed for {country} / {target}: {exc!r}")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
if eval_results:
    df_eval_pred = pd.concat(eval_results, ignore_index=True)
else:
    df_eval_pred = pd.DataFrame(columns=["Country", "Target", "Year", "Prediction", "Actual"])
def pick_best_model(group):
    """Return the 'Model' value of the row in `group` with the smallest 'RMSE'."""
    # idxmin gives the index label of the lowest RMSE
    best_row_label = group['RMSE'].idxmin()
    return group.loc[best_row_label, 'Model']
# Collect the per-model metrics. Naming the columns keeps the frame well-formed
# (and sortable) even when metrics_summary is empty because every model failed.
metric_columns = ['Country', 'Target', 'Model', 'RMSE', 'MAPE', 'R²']
df_metrics = pd.DataFrame(metrics_summary, columns=metric_columns)

# Sort so the rows of each (Country, Target) pair sit together
df_metrics_sorted = df_metrics.sort_values(['Country', 'Target', 'Model']).reset_index(drop=True)

# NOTE: pick_best_model is already defined just before this section; the
# identical second definition that used to sit here was redundant and is dropped.

# Label every row with the lowest-RMSE model of its (Country, Target) group
if df_metrics_sorted.empty:
    df_metrics_sorted['Best_Model'] = pd.Series(dtype=object)
else:
    df_metrics_sorted['Best_Model'] = df_metrics_sorted.groupby(['Country', 'Target'])['RMSE'].transform(
        lambda x: df_metrics_sorted.loc[x.idxmin(), 'Model']
    )

# Display full table
print("\n🎯 Step 20: Evaluation Summary with Best Model\n")
print(df_metrics_sorted[['Country', 'Target', 'Model', 'RMSE', 'MAPE', 'R²', 'Best_Model']].to_string(index=False))

# Export summary
df_metrics_sorted.to_csv("df_metrics_sorted.csv", index=False)

# Download to your computer (Colab only)
from google.colab import files
files.download("df_metrics_sorted.csv")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/eqjjlcvx.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/z2w7pcm_.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=59368', 'data', 'file=/tmp/tmprjkocm4m/eqjjlcvx.json', 'init=/tmp/tmprjkocm4m/z2w7pcm_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelxyjgjdq5/prophet_model-20250723142739.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:39 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:40 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vg5i6vka.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lmka780h.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75109', 'data', 'file=/tmp/tmprjkocm4m/vg5i6vka.json', 'init=/tmp/tmprjkocm4m/lmka780h.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelh7h91zgo/prophet_model-20250723142741.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:41 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:41 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. 
Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gmsvuhmu.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0y0ma29h.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=19632', 'data', 'file=/tmp/tmprjkocm4m/gmsvuhmu.json', 'init=/tmp/tmprjkocm4m/0y0ma29h.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelkw7mltvl/prophet_model-20250723142742.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:42 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:43 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/smiy06t0.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/riq4u0nn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=36038', 'data', 'file=/tmp/tmprjkocm4m/smiy06t0.json', 'init=/tmp/tmprjkocm4m/riq4u0nn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelczq1p405/prophet_model-20250723142743.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:43 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:44 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. 
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t008m8yn.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6dwjfhiq.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=99765', 'data', 'file=/tmp/tmprjkocm4m/t008m8yn.json', 'init=/tmp/tmprjkocm4m/6dwjfhiq.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelr_kg546m/prophet_model-20250723142744.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:44 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:45 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/oadqeb0v.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pk4m4u71.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=12168', 'data', 'file=/tmp/tmprjkocm4m/oadqeb0v.json', 'init=/tmp/tmprjkocm4m/pk4m4u71.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelq7u60i3_/prophet_model-20250723142746.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:46 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:46 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. 
Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h40__qsi.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/n32vskud.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57212', 'data', 'file=/tmp/tmprjkocm4m/h40__qsi.json', 'init=/tmp/tmprjkocm4m/n32vskud.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelz3ogozpi/prophet_model-20250723142747.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:47 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:48 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gscmz__t.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ik9qkw_0.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25384', 'data', 'file=/tmp/tmprjkocm4m/gscmz__t.json', 'init=/tmp/tmprjkocm4m/ik9qkw_0.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldacp3582/prophet_model-20250723142749.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:49 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:49 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/7hpoi5td.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zn3uklyn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=93924', 'data', 'file=/tmp/tmprjkocm4m/7hpoi5td.json', 'init=/tmp/tmprjkocm4m/zn3uklyn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0j5cee4e/prophet_model-20250723142749.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:50 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:50 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8psco2ov.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tkhr35wn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=84665', 'data', 'file=/tmp/tmprjkocm4m/8psco2ov.json', 'init=/tmp/tmprjkocm4m/tkhr35wn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modell9z44mqp/prophet_model-20250723142750.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:50 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:51 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/spgmtiay.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/io0cpspk.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=3082', 'data', 'file=/tmp/tmprjkocm4m/spgmtiay.json', 'init=/tmp/tmprjkocm4m/io0cpspk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelvtbox0en/prophet_model-20250723142751.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:51 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:51 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/eo_te0ht.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vc7l2mzf.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=60614', 'data', 'file=/tmp/tmprjkocm4m/eo_te0ht.json', 'init=/tmp/tmprjkocm4m/vc7l2mzf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelchrkwy9s/prophet_model-20250723142752.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:52 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:52 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pnk604ek.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/g732zgm_.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=35287', 'data', 'file=/tmp/tmprjkocm4m/pnk604ek.json', 'init=/tmp/tmprjkocm4m/g732zgm_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeli6h6u0iz/prophet_model-20250723142752.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:52 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:53 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/b7vblsr6.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fr2_eiu4.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=35398', 'data', 'file=/tmp/tmprjkocm4m/b7vblsr6.json', 'init=/tmp/tmprjkocm4m/fr2_eiu4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelmhphwbs7/prophet_model-20250723142755.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:55 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:55 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/r9g89p_f.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ijnrdngf.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=54075', 'data', 'file=/tmp/tmprjkocm4m/r9g89p_f.json', 'init=/tmp/tmprjkocm4m/ijnrdngf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnxglfmc3/prophet_model-20250723142756.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:56 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:56 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ygbj5tne.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/l9fgdivj.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=54945', 'data', 'file=/tmp/tmprjkocm4m/ygbj5tne.json', 'init=/tmp/tmprjkocm4m/l9fgdivj.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelrv7d2fb0/prophet_model-20250723142756.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:56 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:57 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9460n9e5.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/08bkdmuo.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1255', 'data', 'file=/tmp/tmprjkocm4m/9460n9e5.json', 'init=/tmp/tmprjkocm4m/08bkdmuo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_8o8l24m/prophet_model-20250723142757.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:57 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:57 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8pdq6ykg.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/28lfdde0.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1282', 'data', 'file=/tmp/tmprjkocm4m/8pdq6ykg.json', 'init=/tmp/tmprjkocm4m/28lfdde0.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhlgecqdp/prophet_model-20250723142758.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:58 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:58 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tdpf1_54.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0dwo8sfe.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38022', 'data', 'file=/tmp/tmprjkocm4m/tdpf1_54.json', 'init=/tmp/tmprjkocm4m/0dwo8sfe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzxjdv7s4/prophet_model-20250723142758.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:58 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:59 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1p2t8ykc.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vrwn2mde.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=28166', 'data', 'file=/tmp/tmprjkocm4m/1p2t8ykc.json', 'init=/tmp/tmprjkocm4m/vrwn2mde.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeluzp444cl/prophet_model-20250723142759.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:27:59 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:27:59 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/njpwbnzq.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/kt8aa3fc.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=58212', 'data', 'file=/tmp/tmprjkocm4m/njpwbnzq.json', 'init=/tmp/tmprjkocm4m/kt8aa3fc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model92bmcut4/prophet_model-20250723142800.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:00 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:00 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6_vkip_j.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o8grblo9.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=4955', 'data', 'file=/tmp/tmprjkocm4m/6_vkip_j.json', 'init=/tmp/tmprjkocm4m/o8grblo9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0i3bmw6j/prophet_model-20250723142800.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:00 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:01 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lhv_eq_n.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/w9tg63fy.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13290', 'data', 'file=/tmp/tmprjkocm4m/lhv_eq_n.json', 'init=/tmp/tmprjkocm4m/w9tg63fy.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model1kzgl0y3/prophet_model-20250723142801.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:01 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:01 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/isyciumg.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e5lx5x4_.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7456', 'data', 'file=/tmp/tmprjkocm4m/isyciumg.json', 'init=/tmp/tmprjkocm4m/e5lx5x4_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeliabjxlt5/prophet_model-20250723142802.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:02 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:02 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gfse9eex.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/p41bc0hj.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18278', 'data', 'file=/tmp/tmprjkocm4m/gfse9eex.json', 'init=/tmp/tmprjkocm4m/p41bc0hj.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelasbnq6yo/prophet_model-20250723142802.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:02 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:03 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vxqrg_hg.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8m_ipt_q.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=62453', 'data', 'file=/tmp/tmprjkocm4m/vxqrg_hg.json', 'init=/tmp/tmprjkocm4m/8m_ipt_q.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelentrb2vk/prophet_model-20250723142803.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:03 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:04 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/twg124ht.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/w5quimbo.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=76939', 'data', 'file=/tmp/tmprjkocm4m/twg124ht.json', 'init=/tmp/tmprjkocm4m/w5quimbo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeljgyb62fy/prophet_model-20250723142804.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:04 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:04 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t62344g0.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tisx2xkx.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79164', 'data', 'file=/tmp/tmprjkocm4m/t62344g0.json', 'init=/tmp/tmprjkocm4m/tisx2xkx.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model39mmf_st/prophet_model-20250723142804.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:04 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:05 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/b1h9unmc.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hgcljw8a.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47767', 'data', 'file=/tmp/tmprjkocm4m/b1h9unmc.json', 'init=/tmp/tmprjkocm4m/hgcljw8a.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelfey3cwl9/prophet_model-20250723142805.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:05 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:05 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/adkcdrh2.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/evt78x3n.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17956', 'data', 'file=/tmp/tmprjkocm4m/adkcdrh2.json', 'init=/tmp/tmprjkocm4m/evt78x3n.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpg39k48t/prophet_model-20250723142806.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:06 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:06 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
🎯 Step 20: Evaluation Summary with Best Model
Country Target Model RMSE MAPE R² Best_Model
Bangladesh Cardiovascular diseases ARIMA 1.1756 4.02 -1.094927e+29 ARIMA
Bangladesh Cardiovascular diseases Prophet 6.9912 24.69 -3.872468e+30 ARIMA
Bangladesh Cardiovascular diseases Random Forest 4.9245 17.42 -1.921333e+30 ARIMA
Bangladesh Diabetes ARIMA 0.0000 0.00 0.000000e+00 ARIMA
Bangladesh Diabetes Prophet 2.9878 30.49 0.000000e+00 ARIMA
Bangladesh Diabetes Random Forest 0.1017 0.81 0.000000e+00 ARIMA
Bangladesh Life expectancy ARIMA 2.3127 2.76 -1.102500e+00 Prophet
Bangladesh Life expectancy Prophet 1.6767 1.89 -1.051000e-01 Prophet
Bangladesh Life expectancy Random Forest 2.2987 2.94 -1.077100e+00 Prophet
Brazil Cardiovascular diseases ARIMA 1.8195 4.66 0.000000e+00 ARIMA
Brazil Cardiovascular diseases Prophet 6.5472 16.73 0.000000e+00 ARIMA
Brazil Cardiovascular diseases Random Forest 3.5130 9.02 0.000000e+00 ARIMA
Brazil Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Brazil Diabetes Prophet 0.1860 2.14 0.000000e+00 ARIMA
Brazil Diabetes Random Forest 0.0457 0.55 0.000000e+00 ARIMA
Brazil Life expectancy ARIMA 3.0096 3.29 -5.672800e+00 Random Forest
Brazil Life expectancy Prophet 2.1896 2.66 -2.531900e+00 Random Forest
Brazil Life expectancy Random Forest 1.2862 1.52 -2.188000e-01 Random Forest
Germany Cardiovascular diseases ARIMA 0.4339 1.23 0.000000e+00 ARIMA
Germany Cardiovascular diseases Prophet 2.1255 5.82 0.000000e+00 ARIMA
Germany Cardiovascular diseases Random Forest 0.9503 2.69 0.000000e+00 ARIMA
Germany Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Germany Diabetes Prophet 2.7582 55.13 0.000000e+00 ARIMA
Germany Diabetes Random Forest 0.0000 0.00 1.000000e+00 ARIMA
Germany Life expectancy ARIMA 0.4746 0.44 -1.051900e+00 Random Forest
Germany Life expectancy Prophet 0.6124 0.65 -2.417000e+00 Random Forest
Germany Life expectancy Random Forest 0.3367 0.38 -3.260000e-02 Random Forest
India Cardiovascular diseases ARIMA 19.6630 6.68 0.000000e+00 ARIMA
India Cardiovascular diseases Prophet 37.4210 12.75 0.000000e+00 ARIMA
India Cardiovascular diseases Random Forest 47.5512 16.61 0.000000e+00 ARIMA
India Diabetes ARIMA 0.0197 0.21 0.000000e+00 Random Forest
India Diabetes Prophet 0.8306 9.49 0.000000e+00 Random Forest
India Diabetes Random Forest 0.0017 0.01 0.000000e+00 Random Forest
India Life expectancy ARIMA 1.9737 2.25 1.628000e-01 ARIMA
India Life expectancy Prophet 2.4758 2.42 -3.173000e-01 ARIMA
India Life expectancy Random Forest 2.1906 2.96 -3.130000e-02 ARIMA
Indonesia Cardiovascular diseases ARIMA 8.4866 11.75 0.000000e+00 Random Forest
Indonesia Cardiovascular diseases Prophet 7.9981 9.90 0.000000e+00 Random Forest
Indonesia Cardiovascular diseases Random Forest 0.0971 0.13 0.000000e+00 Random Forest
Indonesia Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Indonesia Diabetes Prophet 0.7121 9.24 0.000000e+00 ARIMA
Indonesia Diabetes Random Forest 0.0035 0.03 0.000000e+00 ARIMA
Indonesia Life expectancy ARIMA 1.8872 2.68 -2.444000e-01 Random Forest
Indonesia Life expectancy Prophet 1.6929 1.48 -1.400000e-03 Random Forest
Indonesia Life expectancy Random Forest 1.6442 2.28 5.540000e-02 Random Forest
Japan Cardiovascular diseases ARIMA 1.5477 3.73 0.000000e+00 ARIMA
Japan Cardiovascular diseases Prophet 7.6884 18.56 0.000000e+00 ARIMA
Japan Cardiovascular diseases Random Forest 4.2376 10.27 0.000000e+00 ARIMA
Japan Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Japan Diabetes Prophet 1.8411 27.47 0.000000e+00 ARIMA
Japan Diabetes Random Forest 0.0162 0.14 0.000000e+00 ARIMA
Japan Life expectancy ARIMA 0.6387 0.68 -4.204900e+00 Random Forest
Japan Life expectancy Prophet 0.5765 0.59 -3.239500e+00 Random Forest
Japan Life expectancy Random Forest 0.3200 0.37 -3.061000e-01 Random Forest
Kenya Cardiovascular diseases ARIMA 0.1218 3.48 -7.516462e+28 ARIMA
Kenya Cardiovascular diseases Prophet 0.9335 26.66 -4.418335e+30 ARIMA
Kenya Cardiovascular diseases Random Forest 0.7993 22.85 -3.239297e+30 ARIMA
Kenya Diabetes ARIMA 0.0004 0.01 0.000000e+00 ARIMA
Kenya Diabetes Prophet 3.4797 57.98 0.000000e+00 ARIMA
Kenya Diabetes Random Forest 0.0052 0.08 0.000000e+00 ARIMA
Kenya Life expectancy ARIMA 3.2353 4.35 -7.360000e+00 Random Forest
Kenya Life expectancy Prophet 1.6706 2.25 -1.228900e+00 Random Forest
Kenya Life expectancy Random Forest 1.2934 1.95 -3.360000e-01 Random Forest
Mexico Cardiovascular diseases ARIMA 0.5788 2.17 -2.654270e+28 ARIMA
Mexico Cardiovascular diseases Prophet 0.8437 3.08 -5.639601e+28 ARIMA
Mexico Cardiovascular diseases Random Forest 6.2764 28.37 -3.121092e+30 ARIMA
Mexico Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Mexico Diabetes Prophet 0.7997 7.13 -2.026747e+29 ARIMA
Mexico Diabetes Random Forest 0.4129 3.01 -5.403202e+28 ARIMA
Mexico Life expectancy ARIMA 6.2245 7.05 -6.367500e+00 Prophet
Mexico Life expectancy Prophet 2.4286 2.54 -1.216000e-01 Prophet
Mexico Life expectancy Random Forest 2.4902 3.34 -1.791000e-01 Prophet
Nigeria Cardiovascular diseases ARIMA 0.7164 3.98 0.000000e+00 ARIMA
Nigeria Cardiovascular diseases Prophet 4.4984 24.97 0.000000e+00 ARIMA
Nigeria Cardiovascular diseases Random Forest 3.6177 20.11 0.000000e+00 ARIMA
Nigeria Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Nigeria Diabetes Prophet 0.1408 2.06 0.000000e+00 ARIMA
Nigeria Diabetes Random Forest 0.0027 0.04 0.000000e+00 ARIMA
Nigeria Life expectancy ARIMA 0.7003 1.17 -1.846100e+00 Prophet
Nigeria Life expectancy Prophet 0.3693 0.58 2.086000e-01 Prophet
Nigeria Life expectancy Random Forest 1.2444 2.14 -7.985900e+00 Prophet
United States Cardiovascular diseases ARIMA 1.1904 1.29 0.000000e+00 ARIMA
United States Cardiovascular diseases Prophet 11.9749 12.93 0.000000e+00 ARIMA
United States Cardiovascular diseases Random Forest 10.0919 10.96 0.000000e+00 ARIMA
United States Diabetes ARIMA 0.0080 0.10 0.000000e+00 Random Forest
United States Diabetes Prophet 0.4896 6.65 0.000000e+00 Random Forest
United States Diabetes Random Forest 0.0040 0.05 0.000000e+00 Random Forest
United States Life expectancy ARIMA 1.9969 2.10 -1.796800e+00 Random Forest
United States Life expectancy Prophet 1.5614 1.63 -7.100000e-01 Random Forest
United States Life expectancy Random Forest 1.2177 1.39 -3.990000e-02 Random Forest
# Evaluation metrics (RMSE, MAPE, R²)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def calculate_metrics(actual, predicted):
    """Return rounded RMSE, MAPE and R² for a set of forecasts.

    Args:
        actual: Observed values (array-like).
        predicted: Forecast values with the same number of elements as
            ``actual`` (array-like; for a pandas object only the values
            are used, its index is ignored).

    Returns:
        A tuple ``(rmse, mape, r2)``: RMSE and R² rounded to 4 decimals,
        MAPE expressed in percent and rounded to 2 decimals. MAPE is
        averaged only over observations whose actual value is non-zero and
        is NaN when there is no such observation.

    Raises:
        ValueError: Propagated from scikit-learn when the inputs are empty,
            differ in length, or contain non-finite values.
    """
    # Work on flat float arrays so the element-wise arithmetic below is
    # positional instead of depending on the input container type.
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)

    # A zero in `actual` would make the percentage error infinite (or NaN for
    # 0/0), so those observations are left out of the MAPE average.
    nonzero = actual != 0
    if nonzero.any():
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = np.nan
    return round(rmse, 4), round(mape, 2), round(r2, 4)
# One record per (country, target, model) holding its hold-out metrics.
metrics_summary = []
# Evaluation years
eval_years = [2021, 2022, 2023]
for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country]
    # The row masks depend only on the country, so build them once per country
    # instead of once per model.
    train_mask = df_country['Year'].between(1950, 2020)
    eval_mask = df_country['Year'].isin(eval_years)
    for target in target_columns:
        if target not in df_country.columns:
            continue
        actual = df_country.loc[eval_mask, target].values
        if actual.size == 0:
            # Nothing to score against; fitting the models would be wasted work.
            print(f"Skipping {country} / {target}: no observations in {eval_years}")
            continue
        # --- ARIMA ---
        # Best effort: a failing model is reported and skipped so the other
        # models and targets are still evaluated.
        try:
            train_series = df_country.loc[train_mask, [target]]
            # Re-index by position with a yearly DatetimeIndex starting at 1950.
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model_arima = ARIMA(train_series, order=(1, 1, 1)).fit()
            # Forecast the len(eval_years) steps that follow the training sample.
            arima_pred = model_arima.predict(
                start=len(train_series),
                end=len(train_series) + len(eval_years) - 1,
            )
            arima_rmse, arima_mape, arima_r2 = calculate_metrics(actual, arima_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "ARIMA",
                "RMSE": arima_rmse, "MAPE": arima_mape, "R²": arima_r2
            })
        except Exception as exc:
            print(f"ARIMA evaluation failed for {country} / {target}: {exc!r}")
        # --- Prophet ---
        try:
            prophet_df = df_country.loc[train_mask, ['Year', target]].rename(
                columns={'Year': 'ds', target: 'y'}
            )
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model_prophet = Prophet()
            model_prophet.fit(prophet_df)
            future_eval = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            prophet_pred = model_prophet.predict(future_eval)['yhat'].values
            prophet_rmse, prophet_mape, prophet_r2 = calculate_metrics(actual, prophet_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Prophet",
                "RMSE": prophet_rmse, "MAPE": prophet_mape, "R²": prophet_r2
            })
        except Exception as exc:
            print(f"Prophet evaluation failed for {country} / {target}: {exc!r}")
        # --- Random Forest ---
        try:
            features = selected_features_dict.get(target, [])
            # Keep only the selected features that exist in this country's frame.
            available = [f for f in features if f in df_country.columns]
            X_train = df_country.loc[train_mask, available]
            y_train = df_country.loc[train_mask, target]
            X_eval = df_country.loc[eval_mask, available]
            model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
            model_rf.fit(X_train, y_train)
            rf_pred = model_rf.predict(X_eval)
            rf_rmse, rf_mape, rf_r2 = calculate_metrics(actual, rf_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Random Forest",
                "RMSE": rf_rmse, "MAPE": rf_mape, "R²": rf_r2
            })
        except Exception as exc:
            print(f"Random Forest evaluation failed for {country} / {target}: {exc!r}")
def pick_best_model(group):
    """Return the 'Model' value of the row with the smallest 'RMSE' in *group*.

    *group* is a DataFrame that has 'RMSE' and 'Model' columns.
    """
    lowest_rmse_label = group['RMSE'].idxmin()
    return group.loc[lowest_rmse_label, 'Model']
# Convert to DataFrame. The columns are named explicitly so the frame keeps its
# schema even when metrics_summary is empty; a column-less frame would make the
# sort below raise KeyError.
df_metrics = pd.DataFrame(
    metrics_summary,
    columns=['Country', 'Target', 'Model', 'RMSE', 'MAPE', 'R²'],
)
# Sort it and assign it to df_metrics_sorted
df_metrics_sorted = df_metrics.sort_values(['Country', 'Target', 'Model']).reset_index(drop=True)
# Best model picker function
# NOTE(review): an identical definition appears a few lines earlier in this
# file; this later one rebinds the name.
def pick_best_model(group):
    """Return the 'Model' entry of the lowest-'RMSE' row of the DataFrame *group*."""
    rmse_values = group['RMSE']
    winner_label = rmse_values.idxmin()
    return group.loc[winner_label, 'Model']
# Assign Best_Model using groupby and transform
df_metrics_sorted['Best_Model'] = df_metrics_sorted.groupby(['Country', 'Target'])['RMSE'].transform(
lambda x: df_metrics_sorted.loc[x.idxmin(), 'Model']
)
# Display full table
print("\n🎯 Step 20: Evaluation Summary with Best Model\n")
print(df_metrics_sorted[['Country', 'Target', 'Model', 'RMSE', 'MAPE', 'R²', 'Best_Model']].to_string(index=False))
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/96agqxb4.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a63jie00.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=58230', 'data', 'file=/tmp/tmprjkocm4m/96agqxb4.json', 'init=/tmp/tmprjkocm4m/a63jie00.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelrbxqmd3k/prophet_model-20250723142811.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:11 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:11 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/f7yn6mcf.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vekzyhes.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=5483', 'data', 'file=/tmp/tmprjkocm4m/f7yn6mcf.json', 'init=/tmp/tmprjkocm4m/vekzyhes.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelutxoc49d/prophet_model-20250723142812.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:12 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:12 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. 
Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/56jfux6d.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/nxnbae66.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=42415', 'data', 'file=/tmp/tmprjkocm4m/56jfux6d.json', 'init=/tmp/tmprjkocm4m/nxnbae66.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model78j27m6s/prophet_model-20250723142812.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:12 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9ph1dk0k.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/to_q09k2.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80779', 'data', 'file=/tmp/tmprjkocm4m/9ph1dk0k.json', 'init=/tmp/tmprjkocm4m/to_q09k2.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3wcrj73h/prophet_model-20250723142813.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:13 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. 
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8lcv5y6z.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/kjesknro.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=15638', 'data', 'file=/tmp/tmprjkocm4m/8lcv5y6z.json', 'init=/tmp/tmprjkocm4m/kjesknro.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelcgb49lbt/prophet_model-20250723142814.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:14 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:14 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/75qvkpfx.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/i2i8_56b.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7959', 'data', 'file=/tmp/tmprjkocm4m/75qvkpfx.json', 'init=/tmp/tmprjkocm4m/i2i8_56b.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelp1rbzqxl/prophet_model-20250723142815.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:15 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:16 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. 
Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/35m11l02.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2cf1fb8e.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=45962', 'data', 'file=/tmp/tmprjkocm4m/35m11l02.json', 'init=/tmp/tmprjkocm4m/2cf1fb8e.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpjdmugoc/prophet_model-20250723142816.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:16 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:17 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6b03wzex.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/r6bgqq_5.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=36355', 'data', 'file=/tmp/tmprjkocm4m/6b03wzex.json', 'init=/tmp/tmprjkocm4m/r6bgqq_5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model01dxs0d2/prophet_model-20250723142818.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:18 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:19 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h_0isjxz.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4666mwro.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=50895', 'data', 'file=/tmp/tmprjkocm4m/h_0isjxz.json', 'init=/tmp/tmprjkocm4m/4666mwro.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelujssn5hl/prophet_model-20250723142820.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:20 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:21 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q5094zuy.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/d21_bqlg.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=42067', 'data', 'file=/tmp/tmprjkocm4m/q5094zuy.json', 'init=/tmp/tmprjkocm4m/d21_bqlg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelavpi2gqq/prophet_model-20250723142822.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:22 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:22 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t5p7euuw.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4lqozamx.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47422', 'data', 'file=/tmp/tmprjkocm4m/t5p7euuw.json', 'init=/tmp/tmprjkocm4m/4lqozamx.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelygoqwes7/prophet_model-20250723142823.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:23 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:24 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tzl8hmsg.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/aqj2c0s9.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=54955', 'data', 'file=/tmp/tmprjkocm4m/tzl8hmsg.json', 'init=/tmp/tmprjkocm4m/aqj2c0s9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model34c4yai4/prophet_model-20250723142824.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:24 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:25 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1jdjv1ym.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/5ply91ys.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=16233', 'data', 'file=/tmp/tmprjkocm4m/1jdjv1ym.json', 'init=/tmp/tmprjkocm4m/5ply91ys.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln8xdotuh/prophet_model-20250723142825.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:25 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:26 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tg6xi57o.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/em7pt54s.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=82535', 'data', 'file=/tmp/tmprjkocm4m/tg6xi57o.json', 'init=/tmp/tmprjkocm4m/em7pt54s.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modellbr_d7wm/prophet_model-20250723142827.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:27 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:28 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/56s2q0ui.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gy08iedo.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=60748', 'data', 'file=/tmp/tmprjkocm4m/56s2q0ui.json', 'init=/tmp/tmprjkocm4m/gy08iedo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelg7jfvios/prophet_model-20250723142828.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:28 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:28 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/mu2y1iux.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/v7jgd4r2.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=9337', 'data', 'file=/tmp/tmprjkocm4m/mu2y1iux.json', 'init=/tmp/tmprjkocm4m/v7jgd4r2.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelku5nak1k/prophet_model-20250723142829.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:29 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:29 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jzvyvewp.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/mr48mr14.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=23976', 'data', 'file=/tmp/tmprjkocm4m/jzvyvewp.json', 'init=/tmp/tmprjkocm4m/mr48mr14.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3lpo3ui7/prophet_model-20250723142829.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:29 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:30 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ofu8h322.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k8n5hxd9.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=70178', 'data', 'file=/tmp/tmprjkocm4m/ofu8h322.json', 'init=/tmp/tmprjkocm4m/k8n5hxd9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelxh05y0r4/prophet_model-20250723142830.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:30 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:30 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/g2xiz9hi.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t5g5emio.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22075', 'data', 'file=/tmp/tmprjkocm4m/g2xiz9hi.json', 'init=/tmp/tmprjkocm4m/t5g5emio.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8cduqjex/prophet_model-20250723142831.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:31 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:31 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/m0rwqk4l.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xnpt149d.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77601', 'data', 'file=/tmp/tmprjkocm4m/m0rwqk4l.json', 'init=/tmp/tmprjkocm4m/xnpt149d.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldubr4w1q/prophet_model-20250723142831.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:31 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:32 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e2it4bm5.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jh2zfokd.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=61228', 'data', 'file=/tmp/tmprjkocm4m/e2it4bm5.json', 'init=/tmp/tmprjkocm4m/jh2zfokd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhr6l8_bh/prophet_model-20250723142833.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:33 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:33 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lnzx2fbb.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ledcbnmk.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22655', 'data', 'file=/tmp/tmprjkocm4m/lnzx2fbb.json', 'init=/tmp/tmprjkocm4m/ledcbnmk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeld9g72k_0/prophet_model-20250723142834.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:34 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:35 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/abygc97f.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3vxzg627.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=41273', 'data', 'file=/tmp/tmprjkocm4m/abygc97f.json', 'init=/tmp/tmprjkocm4m/3vxzg627.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpq_jed9k/prophet_model-20250723142835.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:35 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:35 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pwjgxwnc.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qvr8h8ur.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=51176', 'data', 'file=/tmp/tmprjkocm4m/pwjgxwnc.json', 'init=/tmp/tmprjkocm4m/qvr8h8ur.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelukr7_ca3/prophet_model-20250723142835.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:35 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:36 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0e4wunt5.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/x3p3rsx4.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22674', 'data', 'file=/tmp/tmprjkocm4m/0e4wunt5.json', 'init=/tmp/tmprjkocm4m/x3p3rsx4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzbgkl07q/prophet_model-20250723142836.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:36 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:37 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lbc7c6mv.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/msjjsc5w.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=91260', 'data', 'file=/tmp/tmprjkocm4m/lbc7c6mv.json', 'init=/tmp/tmprjkocm4m/msjjsc5w.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model17vi9bbj/prophet_model-20250723142837.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:37 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:37 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qjasgdr_.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/u1fq__1x.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=81508', 'data', 'file=/tmp/tmprjkocm4m/qjasgdr_.json', 'init=/tmp/tmprjkocm4m/u1fq__1x.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5d4_6ltf/prophet_model-20250723142838.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:38 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:38 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0zuanv8u.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j0tr_zru.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=72026', 'data', 'file=/tmp/tmprjkocm4m/0zuanv8u.json', 'init=/tmp/tmprjkocm4m/j0tr_zru.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model4fjx7_g4/prophet_model-20250723142838.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:38 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:39 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/m8l9oi1q.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fosm3gdg.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75488', 'data', 'file=/tmp/tmprjkocm4m/m8l9oi1q.json', 'init=/tmp/tmprjkocm4m/fosm3gdg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelit1ljs_j/prophet_model-20250723142839.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:39 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:39 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vdfsimm5.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/559dtkca.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38508', 'data', 'file=/tmp/tmprjkocm4m/vdfsimm5.json', 'init=/tmp/tmprjkocm4m/559dtkca.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2fwcy8wp/prophet_model-20250723142839.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 14:28:39 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 14:28:40 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
🎯 Step 20: Evaluation Summary with Best Model
Country Target Model RMSE MAPE R² Best_Model
Bangladesh Cardiovascular diseases ARIMA 1.1756 4.02 -1.094927e+29 ARIMA
Bangladesh Cardiovascular diseases Prophet 6.9912 24.69 -3.872468e+30 ARIMA
Bangladesh Cardiovascular diseases Random Forest 4.9245 17.42 -1.921333e+30 ARIMA
Bangladesh Diabetes ARIMA 0.0000 0.00 0.000000e+00 ARIMA
Bangladesh Diabetes Prophet 2.9878 30.49 0.000000e+00 ARIMA
Bangladesh Diabetes Random Forest 0.1017 0.81 0.000000e+00 ARIMA
Bangladesh Life expectancy ARIMA 2.3127 2.76 -1.102500e+00 Prophet
Bangladesh Life expectancy Prophet 1.6767 1.89 -1.051000e-01 Prophet
Bangladesh Life expectancy Random Forest 2.2987 2.94 -1.077100e+00 Prophet
Brazil Cardiovascular diseases ARIMA 1.8195 4.66 0.000000e+00 ARIMA
Brazil Cardiovascular diseases Prophet 6.5472 16.73 0.000000e+00 ARIMA
Brazil Cardiovascular diseases Random Forest 3.5130 9.02 0.000000e+00 ARIMA
Brazil Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Brazil Diabetes Prophet 0.1860 2.14 0.000000e+00 ARIMA
Brazil Diabetes Random Forest 0.0457 0.55 0.000000e+00 ARIMA
Brazil Life expectancy ARIMA 3.0096 3.29 -5.672800e+00 Random Forest
Brazil Life expectancy Prophet 2.1896 2.66 -2.531900e+00 Random Forest
Brazil Life expectancy Random Forest 1.2862 1.52 -2.188000e-01 Random Forest
Germany Cardiovascular diseases ARIMA 0.4339 1.23 0.000000e+00 ARIMA
Germany Cardiovascular diseases Prophet 2.1255 5.82 0.000000e+00 ARIMA
Germany Cardiovascular diseases Random Forest 0.9503 2.69 0.000000e+00 ARIMA
Germany Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Germany Diabetes Prophet 2.7582 55.13 0.000000e+00 ARIMA
Germany Diabetes Random Forest 0.0000 0.00 1.000000e+00 ARIMA
Germany Life expectancy ARIMA 0.4746 0.44 -1.051900e+00 Random Forest
Germany Life expectancy Prophet 0.6124 0.65 -2.417000e+00 Random Forest
Germany Life expectancy Random Forest 0.3367 0.38 -3.260000e-02 Random Forest
India Cardiovascular diseases ARIMA 19.6630 6.68 0.000000e+00 ARIMA
India Cardiovascular diseases Prophet 37.4210 12.75 0.000000e+00 ARIMA
India Cardiovascular diseases Random Forest 47.5512 16.61 0.000000e+00 ARIMA
India Diabetes ARIMA 0.0197 0.21 0.000000e+00 Random Forest
India Diabetes Prophet 0.8306 9.49 0.000000e+00 Random Forest
India Diabetes Random Forest 0.0017 0.01 0.000000e+00 Random Forest
India Life expectancy ARIMA 1.9737 2.25 1.628000e-01 ARIMA
India Life expectancy Prophet 2.4758 2.42 -3.173000e-01 ARIMA
India Life expectancy Random Forest 2.1906 2.96 -3.130000e-02 ARIMA
Indonesia Cardiovascular diseases ARIMA 8.4866 11.75 0.000000e+00 Random Forest
Indonesia Cardiovascular diseases Prophet 7.9981 9.90 0.000000e+00 Random Forest
Indonesia Cardiovascular diseases Random Forest 0.0971 0.13 0.000000e+00 Random Forest
Indonesia Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Indonesia Diabetes Prophet 0.7121 9.24 0.000000e+00 ARIMA
Indonesia Diabetes Random Forest 0.0035 0.03 0.000000e+00 ARIMA
Indonesia Life expectancy ARIMA 1.8872 2.68 -2.444000e-01 Random Forest
Indonesia Life expectancy Prophet 1.6929 1.48 -1.400000e-03 Random Forest
Indonesia Life expectancy Random Forest 1.6442 2.28 5.540000e-02 Random Forest
Japan Cardiovascular diseases ARIMA 1.5477 3.73 0.000000e+00 ARIMA
Japan Cardiovascular diseases Prophet 7.6884 18.56 0.000000e+00 ARIMA
Japan Cardiovascular diseases Random Forest 4.2376 10.27 0.000000e+00 ARIMA
Japan Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Japan Diabetes Prophet 1.8411 27.47 0.000000e+00 ARIMA
Japan Diabetes Random Forest 0.0162 0.14 0.000000e+00 ARIMA
Japan Life expectancy ARIMA 0.6387 0.68 -4.204900e+00 Random Forest
Japan Life expectancy Prophet 0.5765 0.59 -3.239500e+00 Random Forest
Japan Life expectancy Random Forest 0.3200 0.37 -3.061000e-01 Random Forest
Kenya Cardiovascular diseases ARIMA 0.1218 3.48 -7.516462e+28 ARIMA
Kenya Cardiovascular diseases Prophet 0.9335 26.66 -4.418335e+30 ARIMA
Kenya Cardiovascular diseases Random Forest 0.7993 22.85 -3.239297e+30 ARIMA
Kenya Diabetes ARIMA 0.0004 0.01 0.000000e+00 ARIMA
Kenya Diabetes Prophet 3.4797 57.98 0.000000e+00 ARIMA
Kenya Diabetes Random Forest 0.0052 0.08 0.000000e+00 ARIMA
Kenya Life expectancy ARIMA 3.2353 4.35 -7.360000e+00 Random Forest
Kenya Life expectancy Prophet 1.6706 2.25 -1.228900e+00 Random Forest
Kenya Life expectancy Random Forest 1.2934 1.95 -3.360000e-01 Random Forest
Mexico Cardiovascular diseases ARIMA 0.5788 2.17 -2.654270e+28 ARIMA
Mexico Cardiovascular diseases Prophet 0.8437 3.08 -5.639601e+28 ARIMA
Mexico Cardiovascular diseases Random Forest 6.2764 28.37 -3.121092e+30 ARIMA
Mexico Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Mexico Diabetes Prophet 0.7997 7.13 -2.026747e+29 ARIMA
Mexico Diabetes Random Forest 0.4129 3.01 -5.403202e+28 ARIMA
Mexico Life expectancy ARIMA 6.2245 7.05 -6.367500e+00 Prophet
Mexico Life expectancy Prophet 2.4286 2.54 -1.216000e-01 Prophet
Mexico Life expectancy Random Forest 2.4902 3.34 -1.791000e-01 Prophet
Nigeria Cardiovascular diseases ARIMA 0.7164 3.98 0.000000e+00 ARIMA
Nigeria Cardiovascular diseases Prophet 4.4984 24.97 0.000000e+00 ARIMA
Nigeria Cardiovascular diseases Random Forest 3.6177 20.11 0.000000e+00 ARIMA
Nigeria Diabetes ARIMA 0.0000 0.00 1.000000e+00 ARIMA
Nigeria Diabetes Prophet 0.1408 2.06 0.000000e+00 ARIMA
Nigeria Diabetes Random Forest 0.0027 0.04 0.000000e+00 ARIMA
Nigeria Life expectancy ARIMA 0.7003 1.17 -1.846100e+00 Prophet
Nigeria Life expectancy Prophet 0.3693 0.58 2.086000e-01 Prophet
Nigeria Life expectancy Random Forest 1.2444 2.14 -7.985900e+00 Prophet
United States Cardiovascular diseases ARIMA 1.1904 1.29 0.000000e+00 ARIMA
United States Cardiovascular diseases Prophet 11.9749 12.93 0.000000e+00 ARIMA
United States Cardiovascular diseases Random Forest 10.0919 10.96 0.000000e+00 ARIMA
United States Diabetes ARIMA 0.0080 0.10 0.000000e+00 Random Forest
United States Diabetes Prophet 0.4896 6.65 0.000000e+00 Random Forest
United States Diabetes Random Forest 0.0040 0.05 0.000000e+00 Random Forest
United States Life expectancy ARIMA 1.9969 2.10 -1.796800e+00 Random Forest
United States Life expectancy Prophet 1.5614 1.63 -7.100000e-01 Random Forest
United States Life expectancy Random Forest 1.2177 1.39 -3.990000e-02 Random Forest
Forecast Comparison of ARIMA, Prophet, and Random Forest Across Three Targets
# Forecast Comparison of ARIMA, Prophet, and Random Forest Across Three Targets
import matplotlib.pyplot as plt
import numpy as np
def plot_target_forecast(df_model_all, df_eval_ready, country, target):
    """Plot one country's observed *target* series against the three models.

    The figure shows the observed series for 1950–2023, every model's
    predictions for the evaluation years 2021–2023 (dashed lines) and every
    model's predictions for the remaining years up to 2074 (solid lines).
    The 2024–2074 span is shaded.

    Args:
        df_model_all: Long-form predictions with columns ``Country``,
            ``Target``, ``Year``, ``RF_Forecast``, ``ARIMA_Forecast`` and
            ``Prophet_Forecast``.
        df_eval_ready: Observed data with columns ``Country``, ``Year`` and
            one column named after ``target``.
        country: Value matched against the ``Country`` column of both frames.
        target: Value matched against ``Target`` in ``df_model_all`` and used
            as the column name in ``df_eval_ready``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Year boundaries are defined once and reused for filtering, shading and
    # the x-axis limits so they cannot drift apart.
    history_start, history_end = 1950, 2023
    eval_years = [2021, 2022, 2023]
    forecast_start, forecast_end = 2024, 2074

    # === Actual values for 1950–2023
    df_actual = df_eval_ready[
        (df_eval_ready['Country'] == country) &
        (df_eval_ready['Year'].between(history_start, history_end))
    ].sort_values('Year')
    actual_years = df_actual['Year'].values
    actual_vals = df_actual[target].values

    # === Model predictions for 2021–2074
    df_pred = df_model_all[
        (df_model_all['Country'] == country) &
        (df_model_all['Target'] == target) &
        (df_model_all['Year'].between(min(eval_years), forecast_end))
    ].sort_values('Year')

    # Split the predictions into evaluation rows and everything else.
    is_eval = df_pred['Year'].isin(eval_years)
    df_pred_eval = df_pred[is_eval]
    df_pred_future = df_pred[~is_eval]

    # (legend prefix, colour, eval values, forecast values) per model.
    # Columns are read before the figure is created so that a missing column
    # fails without leaving an empty figure behind.
    model_series = [
        (name, colour, df_pred_eval[column].values, df_pred_future[column].values)
        for name, column, colour in (
            ("RF", 'RF_Forecast', 'dodgerblue'),
            ("ARIMA", 'ARIMA_Forecast', 'forestgreen'),
            ("Prophet", 'Prophet_Forecast', 'darkorchid'),
        )
    ]
    eval_x = df_pred_eval['Year'].values
    future_x = df_pred_future['Year'].values

    # Begin plot
    plt.figure(figsize=(14, 6))

    # Shaded forecast area
    plt.axvspan(forecast_start, forecast_end, color='gray', alpha=0.12, label="Forecast Horizon")

    # Actual line
    plt.plot(actual_years, actual_vals, label="Actual", color='orange', linewidth=2)

    # Prediction lines (2021–2023); skipped when no evaluation rows exist
    if len(eval_x):
        for name, colour, eval_vals, _ in model_series:
            plt.plot(eval_x, eval_vals, label=f"{name} Eval", color=colour,
                     linestyle='dashed', linewidth=2)

    # Forecast lines (2024–2074); skipped when no forecast rows exist
    if len(future_x):
        for name, colour, _, future_vals in model_series:
            plt.plot(future_x, future_vals, label=f"{name} Forecast", color=colour,
                     linewidth=2)

    # Final plot touches
    plt.title(f"{target} — Actual, Evaluation & Forecast Comparison ({country})", fontsize=16)
    plt.xlabel("Year")
    plt.ylabel("Value")
    plt.grid(True)
    plt.legend()
    plt.xlim(history_start, forecast_end)
    plt.tight_layout()
    plt.show()
# Countries to chart, one figure per (country, target) pair.
selected_countries = [
    'United States',
    'Germany',
    'Japan',
    'Brazil',
    'India',
    'Indonesia',
    'Nigeria',
    'Kenya',
    'Mexico',
    'Bangladesh',
]

# Health indicators to chart for every country above.
selected_targets = [
    "Life expectancy",
    "Diabetes",
    "Cardiovascular diseases",
]

# Draw the figures country by country, cycling through all targets each time.
for country in selected_countries:
    for target in selected_targets:
        plot_target_forecast(
            df_model_comparison,
            df_forecast_ready,
            country,
            target,
        )
# Plot comparison for 4 countries
import matplotlib.pyplot as plt
import seaborn as sns
# Countries and years to plot
countries_to_plot = ['United States', 'Mexico', 'India', 'Japan']
years_to_plot = [2021, 2022, 2023]

# Combine every model's validation frames into one long DataFrame
val_results = pd.concat(
    [
        pd.concat(model_frames, ignore_index=True)
        for model_frames in (arima_val_all, prophet_val_all, rf_val_all)
    ],
    ignore_index=True,
)
# Harmonise the Random Forest label with the spelling used elsewhere
val_results['Model'] = val_results['Model'].replace({'RandomForest': 'Random Forest'})

# Keep only the chosen countries, years and a single target variable
target_of_interest = 'Cardiovascular diseases'
plot_df = val_results[
    (val_results['Country'].isin(countries_to_plot)) &
    (val_results['Year'].isin(years_to_plot)) &
    (val_results['Target'] == target_of_interest)
].copy()

# Set seaborn style
sns.set(style="whitegrid")

# One panel per country. The grid is sized from the number of countries, so
# changing countries_to_plot neither overruns the axes nor needs manual edits
# (four countries still give the original 2x2 grid at 16x10).
n_cols = 2
n_rows = max(1, -(-len(countries_to_plot) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows),
                        sharey=True, squeeze=False)
axs = axs.flatten()

for i, country in enumerate(countries_to_plot):
    ax = axs[i]
    country_data = plot_df[plot_df['Country'] == country]

    # Plot Actual values. Sorting by Year keeps the line from zig-zagging
    # when the concatenated rows are not already in chronological order.
    actual_data = country_data[['Year', 'Actual']].drop_duplicates().sort_values('Year')
    ax.plot(actual_data['Year'], actual_data['Actual'], label='Actual', color='black', marker='o')

    # Plot Forecasts from each model
    for model in country_data['Model'].unique():
        model_data = country_data[country_data['Model'] == model].sort_values('Year')
        ax.plot(model_data['Year'], model_data['Forecast'], label=f'Forecast ({model})', marker='x')

    ax.set_title(f'{country} - Actual vs Predicted ({target_of_interest})')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)

# Hide panels left over when the number of countries does not fill the grid
for spare_ax in axs[len(countries_to_plot):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()
# Plot Actual vs Predicted (RF, ARIMA, Prophet)
# Plots Testing- Actual vs Predict (RF, ARIMA, Prophet) - 18 July
import matplotlib.pyplot as plt
import numpy as np
def plot_target_forecast(df, country, target, df_actual=None):
    """Plot the ARIMA, Prophet and Random Forest predictions for one series.

    All rows of ``df`` for the given country and target are drawn as three
    lines, the 2024–2074 span is shaded, and markers are drawn for the
    evaluation years 2021–2023.

    Args:
        df: Long-form predictions with columns ``Country``, ``Target``,
            ``Year``, ``ARIMA_Forecast``, ``RF_Forecast`` and
            ``Prophet_Forecast``.
        country: Value matched against the ``Country`` column.
        target: Value matched against the ``Target`` column (and used as the
            column name in ``df_actual`` when that is supplied).
        df_actual: Optional observed data with columns ``Country``, ``Year``
            and one column named after ``target``. When it yields rows for
            2021–2023, those real observations are used for the markers.
            Otherwise the ARIMA predictions stand in for them and the legend
            says so, instead of presenting model output as "Observed".

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    eval_years = [2021, 2022, 2023]
    forecast_start, forecast_end = 2024, 2074

    # Filter data for country & target
    df_ct = df[(df['Country'] == country) & (df['Target'] == target)].sort_values('Year')

    # Extract data
    years = df_ct['Year']
    arima = df_ct['ARIMA_Forecast']
    rf = df_ct['RF_Forecast']
    prophet = df_ct['Prophet_Forecast']

    # Look for real observations first
    observed = None
    if df_actual is not None and target in df_actual.columns:
        observed = df_actual[
            (df_actual['Country'] == country) &
            (df_actual['Year'].isin(eval_years))
        ].sort_values('Year')

    if observed is not None and not observed.empty:
        marker_years = observed['Year']
        marker_vals = observed[target]
        marker_label = 'Observed (2021–2023)'
    else:
        # No observations available to this function: fall back to the ARIMA
        # predictions as a stand-in and label them as such.
        eval_mask = df_ct['Year'].isin(eval_years)
        marker_years = df_ct.loc[eval_mask, 'Year']
        marker_vals = arima[eval_mask]
        marker_label = 'ARIMA prediction (proxy for observed, 2021–2023)'

    # Start plot
    plt.figure(figsize=(13, 6))

    # Forecast region shading
    plt.axvspan(forecast_start, forecast_end, color='gray', alpha=0.12, label='Forecast Horizon')

    # Plot forecasts
    plt.plot(years, arima, color='forestgreen', linewidth=2, label='ARIMA Forecast')
    plt.plot(years, prophet, color='darkorchid', linewidth=2, label='Prophet Forecast')
    plt.plot(years, rf, color='navy', linewidth=2, label='Random Forest Forecast')

    # Plot evaluation-year markers on top of the lines
    plt.scatter(marker_years, marker_vals,
                color='orange', edgecolor='black', s=90,
                label=marker_label, zorder=5)

    # Final touches
    plt.title(f"{target} Forecast — {country}", fontsize=16)
    plt.xlabel("Year")
    plt.ylabel("Value")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
# Countries to chart, one figure per (country, target) pair.
selected_countries = [
    'United States',
    'Germany',
    'Japan',
    'Brazil',
    'India',
    'Indonesia',
    'Nigeria',
    'Kenya',
    'Mexico',
    'Bangladesh',
]

# For every country, draw the three health indicators in turn.
for country in selected_countries:
    for target in (
        "Life expectancy",
        "Diabetes",
        "Cardiovascular diseases",
    ):
        plot_target_forecast(df_model_comparison, country, target)
# Plots Testing- Actual vs Predict (RF, ARIMA, Prophet)
# Fit ARIMA, Prophet and Random Forest for every (country, target) pair and
# collect their evaluation-year and forecast-horizon predictions in long form.
# A model that fails leaves its RMSE as None and its predictions empty; the
# failure is reported instead of being silently discarded.
forecast_summary = []

for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')

    # Row masks depend only on the country, so build them once and reuse them
    # for every target and for the Random Forest feature matrices.
    train_mask = df_country['Year'].between(start_train, end_train)
    eval_mask = df_country['Year'].isin(eval_years)
    forecast_mask = df_country['Year'].isin(forecast_horizon)
    df_train = df_country[train_mask]
    df_eval = df_country[eval_mask]
    df_forecast = df_country[forecast_mask]

    for target in target_columns:
        if target not in df_country.columns:
            continue
        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            continue

        actual_eval = df_eval[target].values

        #### ARIMA ####
        arima_rmse, arima_forecast_eval, arima_forecast = None, [], []
        try:
            train_series = df_train[[target]].copy()
            # NOTE(review): the index always starts at 1950 regardless of
            # start_train; predictions below are requested by integer
            # position, so this looks label-only. Confirm.
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model = ARIMA(train_series, order=(1, 1, 1)).fit()
            n_train, n_eval = len(train_series), len(df_eval)
            # Steps right after the training sample are treated as the
            # evaluation rows, and the steps after those as the forecast rows.
            pred_eval_arima = model.predict(start=n_train, end=n_train + n_eval - 1)
            arima_rmse = np.sqrt(mean_squared_error(actual_eval, pred_eval_arima))
            arima_forecast_eval = pred_eval_arima.tolist()
            if len(df_forecast):
                arima_forecast = model.predict(
                    start=n_train + n_eval,
                    end=n_train + n_eval + len(df_forecast) - 1,
                ).tolist()
        except Exception as exc:  # keep the run going, but surface the failure
            print(f"[warn] ARIMA failed for {country} / {target}: {exc!r}")

        #### Prophet ####
        prophet_rmse, prophet_forecast_eval, prophet_forecast = None, [], []
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model = Prophet()
            model.fit(prophet_df)
            eval_dates = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            forecast_eval_prophet = model.predict(eval_dates)
            prophet_rmse = np.sqrt(mean_squared_error(actual_eval, forecast_eval_prophet['yhat'].values))
            prophet_forecast_eval = forecast_eval_prophet['yhat'].tolist()
            if len(df_forecast):
                forecast_years_df = pd.DataFrame({'ds': pd.to_datetime(df_forecast['Year'], format='%Y')})
                prophet_forecast = model.predict(forecast_years_df)['yhat'].tolist()
        except Exception as exc:
            print(f"[warn] Prophet failed for {country} / {target}: {exc!r}")

        #### RF ####
        rf_rmse, rf_forecast_eval, rf_forecast = None, [], []
        try:
            X = df_country[available_features]
            y = df_country[target]
            X_train, y_train = X[train_mask], y[train_mask]
            X_eval, y_eval = X[eval_mask], y[eval_mask]
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            pred_eval_rf = model.predict(X_eval)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, pred_eval_rf))
            rf_forecast_eval = pred_eval_rf.tolist()
            X_forecast = X[forecast_mask]
            if X_forecast.isnull().any(axis=1).any():
                # Any missing future feature value: emit placeholders only.
                rf_forecast = [None] * len(X_forecast)
            elif len(X_forecast):
                rf_forecast = model.predict(X_forecast).tolist()
        except Exception as exc:
            print(f"[warn] Random Forest failed for {country} / {target}: {exc!r}")

        # Append evaluation predictions first, then future forecasts. Each
        # row carries the model's RMSE; a year the model produced no value
        # for gets None.
        for years, arima_vals, prophet_vals, rf_vals in (
            (eval_years, arima_forecast_eval, prophet_forecast_eval, rf_forecast_eval),
            (df_forecast['Year'], arima_forecast, prophet_forecast, rf_forecast),
        ):
            for i, year in enumerate(years):
                forecast_summary.append({
                    "Country": country,
                    "Target": target,
                    "Year": year,
                    "ARIMA_RMSE": arima_rmse,
                    "ARIMA_Forecast": arima_vals[i] if i < len(arima_vals) else None,
                    "Prophet_RMSE": prophet_rmse,
                    "Prophet_Forecast": prophet_vals[i] if i < len(prophet_vals) else None,
                    "RF_RMSE": rf_rmse,
                    "RF_Forecast": rf_vals[i] if i < len(rf_vals) else None,
                })

# Explicit columns keep the sort from raising KeyError when nothing was collected.
summary_columns = [
    "Country", "Target", "Year",
    "ARIMA_RMSE", "ARIMA_Forecast",
    "Prophet_RMSE", "Prophet_Forecast",
    "RF_RMSE", "RF_Forecast",
]
df_model_comparison = pd.DataFrame(forecast_summary, columns=summary_columns).sort_values(
    ["Country", "Target", "Year"]
)
def plot_target_forecast(df_model_all, df_eval_ready, country, target,
                         eval_years=(2021, 2022, 2023), forecast_years=None):
    """Plot actual values against RF, ARIMA and Prophet predictions.

    Draws one figure for a single country/target pair: the actual values for
    the evaluation years, the three model prediction series over evaluation
    plus forecast years, and a shaded band marking the forecast horizon.

    Args:
        df_model_all: DataFrame with the columns 'Country', 'Target', 'Year',
            'RF_Forecast', 'ARIMA_Forecast' and 'Prophet_Forecast'.
        df_eval_ready: DataFrame with 'Country', 'Year' and a column named
            after ``target`` holding the actual values.
        country: Country name used to filter both frames.
        target: Target name; selects rows of ``df_model_all`` and the value
            column of ``df_eval_ready``.
        eval_years: Years for which actual values are drawn. Defaults to
            2021-2023.
        forecast_years: Years of the forecast horizon. Defaults to 2024-2074.

    Returns:
        None. The figure is displayed with ``plt.show()``; when there are no
        matching forecast rows a message is printed and nothing is drawn.
    """
    eval_years = list(eval_years)
    forecast_years = list(range(2024, 2075) if forecast_years is None else forecast_years)
    full_years = eval_years + forecast_years

    df_actual = df_eval_ready[
        (df_eval_ready['Country'] == country) &
        (df_eval_ready['Year'].isin(eval_years))
    ][['Year', target]].sort_values('Year')
    df_plot = df_model_all[
        (df_model_all['Country'] == country) &
        (df_model_all['Target'] == target) &
        (df_model_all['Year'].isin(full_years))
    ].sort_values('Year')

    if df_plot.empty:
        # Nothing to draw for this pair; avoid showing an empty figure.
        print(f"No forecast rows for {country} / {target}; skipping plot.")
        return

    years = df_plot['Year'].values
    # Coerce to float so None placeholders become NaN and plot as gaps.
    rf_vals = pd.to_numeric(df_plot['RF_Forecast'], errors='coerce').values
    arima_vals = pd.to_numeric(df_plot['ARIMA_Forecast'], errors='coerce').values
    prophet_vals = pd.to_numeric(df_plot['Prophet_Forecast'], errors='coerce').values

    # Align actual values to the plotted years in one step; years without an
    # actual value become NaN. Duplicate years keep their first row so the
    # lookup index is unique.
    actual_by_year = df_actual.drop_duplicates('Year').set_index('Year')[target]
    actual_line = actual_by_year.reindex(years).values

    # Plot
    plt.figure(figsize=(13, 6))
    if forecast_years:
        plt.axvspan(min(forecast_years), max(forecast_years),
                    color='gray', alpha=0.12, label='Forecast Horizon')
    # Legend labels are plain text: the emoji markers used previously are not
    # available in Matplotlib's default font and rendered as missing glyphs.
    plt.plot(years, actual_line, label="Actual", color='orange', linewidth=2)
    plt.plot(years, rf_vals, label="RF Prediction", color='dodgerblue', linewidth=2)
    plt.plot(years, arima_vals, label="ARIMA Prediction", color='forestgreen', linewidth=2)
    plt.plot(years, prophet_vals, label="Prophet Prediction", color='darkorchid', linewidth=2)
    plt.title(f"{target} — Actual & Forecast Comparison ({country})", fontsize=16)
    plt.xlabel("Year")
    plt.ylabel("Value")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
# Countries and targets to visualise; one figure is produced per
# (country, target) combination, countries varying slowest.
selected_countries = [
    'United States',
    'Germany',
    'Japan',
    'Brazil',
    'India',
    'Indonesia',
    'Nigeria',
    'Kenya',
    'Mexico',
    'Bangladesh',
]
selected_targets = [
    "Life expectancy",
    "Diabetes",
    "Cardiovascular diseases",
]
plot_jobs = [
    (country_name, target_name)
    for country_name in selected_countries
    for target_name in selected_targets
]
for country, target in plot_jobs:
    plot_target_forecast(df_model_comparison, df_forecast_ready, country, target)
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xcn15p9f.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fq7s2wc4.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=56323', 'data', 'file=/tmp/tmprjkocm4m/xcn15p9f.json', 'init=/tmp/tmprjkocm4m/fq7s2wc4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelv0_il2me/prophet_model-20250723150106.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:06 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:08 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3i0cknr4.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/n0ji9b3r.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=90929', 'data', 'file=/tmp/tmprjkocm4m/3i0cknr4.json', 'init=/tmp/tmprjkocm4m/n0ji9b3r.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelg7_3e6i4/prophet_model-20250723150110.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:10 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:10 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. 
Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/94b6i4ty.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wiku0dsr.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97008', 'data', 'file=/tmp/tmprjkocm4m/94b6i4ty.json', 'init=/tmp/tmprjkocm4m/wiku0dsr.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelh4lwn385/prophet_model-20250723150111.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:11 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:12 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qg7p85e3.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4sctqmvy.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=44341', 'data', 'file=/tmp/tmprjkocm4m/qg7p85e3.json', 'init=/tmp/tmprjkocm4m/4sctqmvy.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelw54y1lrb/prophet_model-20250723150112.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:12 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. 
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jhcw5fgj.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gee4nwbv.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=89937', 'data', 'file=/tmp/tmprjkocm4m/jhcw5fgj.json', 'init=/tmp/tmprjkocm4m/gee4nwbv.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpy6sfrua/prophet_model-20250723150113.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:13 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:13 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hya61vxx.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gqd80ecs.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=95356', 'data', 'file=/tmp/tmprjkocm4m/hya61vxx.json', 'init=/tmp/tmprjkocm4m/gqd80ecs.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeltqfulq0n/prophet_model-20250723150114.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:14 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:14 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. 
Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ko5xb_ld.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/41l5k7jf.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6632', 'data', 'file=/tmp/tmprjkocm4m/ko5xb_ld.json', 'init=/tmp/tmprjkocm4m/41l5k7jf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelwh4bbhju/prophet_model-20250723150115.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:15 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:15 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/brryefbv.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/79kzkg23.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=45981', 'data', 'file=/tmp/tmprjkocm4m/brryefbv.json', 'init=/tmp/tmprjkocm4m/79kzkg23.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8a83zuam/prophet_model-20250723150115.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:15 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:16 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1vtpzvl4.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zo3hc0yn.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=27575', 'data', 'file=/tmp/tmprjkocm4m/1vtpzvl4.json', 'init=/tmp/tmprjkocm4m/zo3hc0yn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelyvq_dlwd/prophet_model-20250723150116.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:16 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:16 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e7rkf5ty.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/iqbldoj4.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=39930', 'data', 'file=/tmp/tmprjkocm4m/e7rkf5ty.json', 'init=/tmp/tmprjkocm4m/iqbldoj4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2c3lh1fk/prophet_model-20250723150117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:17 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:17 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yq_o5a0j.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q586xxt7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=49176', 'data', 'file=/tmp/tmprjkocm4m/yq_o5a0j.json', 'init=/tmp/tmprjkocm4m/q586xxt7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model47zxobpr/prophet_model-20250723150117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:17 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:18 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4a97gqcs.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/b2zn8__y.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25471', 'data', 'file=/tmp/tmprjkocm4m/4a97gqcs.json', 'init=/tmp/tmprjkocm4m/b2zn8__y.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_33hc8lw/prophet_model-20250723150118.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:18 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:18 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/buq0wfdv.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/en47nn7i.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=33930', 'data', 'file=/tmp/tmprjkocm4m/buq0wfdv.json', 'init=/tmp/tmprjkocm4m/en47nn7i.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model05mkl102/prophet_model-20250723150119.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:19 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:19 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6k7ywp4j.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/s5jjzobg.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=4964', 'data', 'file=/tmp/tmprjkocm4m/6k7ywp4j.json', 'init=/tmp/tmprjkocm4m/s5jjzobg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_qx7pexn/prophet_model-20250723150120.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:20 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:20 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4tykcuex.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4uz180rf.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=19771', 'data', 'file=/tmp/tmprjkocm4m/4tykcuex.json', 'init=/tmp/tmprjkocm4m/4uz180rf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeleq3wjnzq/prophet_model-20250723150121.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:21 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:21 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/dub9k5jc.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a_8fsjbb.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=27947', 'data', 'file=/tmp/tmprjkocm4m/dub9k5jc.json', 'init=/tmp/tmprjkocm4m/a_8fsjbb.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8cz227c3/prophet_model-20250723150122.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:22 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:23 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e2sg94bz.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ma6e3h6o.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=59076', 'data', 'file=/tmp/tmprjkocm4m/e2sg94bz.json', 'init=/tmp/tmprjkocm4m/ma6e3h6o.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modellx89hbtk/prophet_model-20250723150123.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:23 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:24 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2foc1vzw.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3p1hzrs8.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25301', 'data', 'file=/tmp/tmprjkocm4m/2foc1vzw.json', 'init=/tmp/tmprjkocm4m/3p1hzrs8.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnfinlbmz/prophet_model-20250723150124.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:24 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:24 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4w6q0v9n.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ttzzynt3.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=40853', 'data', 'file=/tmp/tmprjkocm4m/4w6q0v9n.json', 'init=/tmp/tmprjkocm4m/ttzzynt3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5g2l9frw/prophet_model-20250723150125.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:25 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:25 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q6sitoku.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jxnz0p82.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=45329', 'data', 'file=/tmp/tmprjkocm4m/q6sitoku.json', 'init=/tmp/tmprjkocm4m/jxnz0p82.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelbomqvztx/prophet_model-20250723150125.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:25 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:26 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sxj4mx1i.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j3ff4el7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=55135', 'data', 'file=/tmp/tmprjkocm4m/sxj4mx1i.json', 'init=/tmp/tmprjkocm4m/j3ff4el7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0a2p2g90/prophet_model-20250723150126.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:26 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:26 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gpjbd4kf.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4be0jlhe.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=64708', 'data', 'file=/tmp/tmprjkocm4m/gpjbd4kf.json', 'init=/tmp/tmprjkocm4m/4be0jlhe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelj_13ox_d/prophet_model-20250723150127.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:27 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:27 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tyabzpsb.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/dsimj0a7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75251', 'data', 'file=/tmp/tmprjkocm4m/tyabzpsb.json', 'init=/tmp/tmprjkocm4m/dsimj0a7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model416aph0l/prophet_model-20250723150128.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:28 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:28 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8eok89ec.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ngwetz9c.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13514', 'data', 'file=/tmp/tmprjkocm4m/8eok89ec.json', 'init=/tmp/tmprjkocm4m/ngwetz9c.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model6ra7svlu/prophet_model-20250723150128.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:28 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:29 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ctysfzxc.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/py2f0mie.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79130', 'data', 'file=/tmp/tmprjkocm4m/ctysfzxc.json', 'init=/tmp/tmprjkocm4m/py2f0mie.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelfzuldb6m/prophet_model-20250723150129.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:29 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:30 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sorq_sl7.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3cpdd1hk.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=48566', 'data', 'file=/tmp/tmprjkocm4m/sorq_sl7.json', 'init=/tmp/tmprjkocm4m/3cpdd1hk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5qngdowd/prophet_model-20250723150130.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:30 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:30 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ts10rp61.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9xnd_9dl.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=2376', 'data', 'file=/tmp/tmprjkocm4m/ts10rp61.json', 'init=/tmp/tmprjkocm4m/9xnd_9dl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model4cm1tcdf/prophet_model-20250723150131.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:31 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:31 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ki_scwy0.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pvgasxyf.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=23559', 'data', 'file=/tmp/tmprjkocm4m/ki_scwy0.json', 'init=/tmp/tmprjkocm4m/pvgasxyf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzzp4uuj8/prophet_model-20250723150131.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:31 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:32 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jg311jj0.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2bsjz2e7.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=9497', 'data', 'file=/tmp/tmprjkocm4m/jg311jj0.json', 'init=/tmp/tmprjkocm4m/2bsjz2e7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelqgx6y8ti/prophet_model-20250723150132.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:32 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:32 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/v5h3o2i9.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/mpabi3zd.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=27575', 'data', 'file=/tmp/tmprjkocm4m/v5h3o2i9.json', 'init=/tmp/tmprjkocm4m/mpabi3zd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelj5nlrp04/prophet_model-20250723150133.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 15:01:33 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 15:01:33 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing